SENDAs Agreement 1 Update 2010-2022 (step 2)
First step of deduplication process. Exploratory data analysis was conducted, addressing issues such as data entry errors, missing values, and the conversion of the date of birth into the age at the time of the first discharge for each individual.
Data Loading and Exploration
Loading Packages and uniting databases
Proceed to load the necessary packages.
Code
invisible("Only run from Ubuntu")
if (!(Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv"))) {
if(Sys.info()["sysname"]!="Windows"){
Sys.setenv(RETICULATE_PYTHON = "/home/fondecytacc/.pyenv/versions/3.11.5/bin/python")
}
}
#clean enviroment
rm(list = ls()); gc()
file.path(paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))),"data/20241015_out"))
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
wdpath
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
time_before_dedup2<-Sys.time()
#base::load(paste0(wdpath,"data/20241015_out/","3_ndp_2025_05_30.Rdata"))
if (!(Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv"))) {
file.path(paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))),"data/20241015_out"))
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
wdpath
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
base::load(paste0(wdpath,"data/20241015_out/","3_ndp_2025_06_02.Rdata"))
} else {
file.path(paste0(getwd(),"/_input"))
paste0(getwd(),"/_input","/3_ndp_2025_06_02.Rdata")
base::load(paste0(getwd(),"/_input","/3_ndp_2025_06_02.Rdata.enc"))
}
time_before_dedup1<-Sys.time()
password <- Sys.getenv("PASS_PPIO")
system(sprintf("7z x path/to/_input/3_ndp_2025_06_02.Rdata.7z.001 -p'%s'", password)) used (Mb) gc trigger (Mb) max used (Mb)
Ncells 605956 32.4 1279675 68.4 1086641 58.1
Vcells 1221570 9.4 8388608 64.0 2106365 16.1
[1] "G:/My Drive/Alvacast/SISTRAT 2023//data/20241015_out"
[1] "G:/My Drive/Alvacast/SISTRAT 2023//"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
[1] 127
Code
#https://github.com/rstudio/renv/issues/544
#renv falls back to copying rather than symlinking, which is evidently very slow in this configuration.
renv::settings$use.cache(FALSE)
#only use explicit dependencies (in DESCRIPTION)
renv::settings$snapshot.type("implicit")
#check if rstools is installed
try(installr::install.Rtools(check_r_update=F))Code
if(quarto::quarto_version()<"1.7.29"){
stop("You need to install a recent quarto version") # la publicada el 28-abr-2025
}
#change repository to CL
local({
r <- getOption("repos")
r["CRAN"] <- "https://cran.dcc.uchile.cl/"
options(repos=r)
})
if(!require(pacman)){install.packages("pacman");require(pacman)}Code
if(!require(pak)){install.packages("pak");require(pak)}Code
pacman::p_unlock(lib.loc = .libPaths()) #para no tener problemas reinstalando paquetesCode
if(Sys.info()["sysname"]=="Windows"){
if (getRversion() != "4.4.1") { stop("Requires R version 4.4.1; Actual: ", getRversion()) }
}
#check docker
check_docker_running <- function() {
# Try running 'docker info' to check if Docker is running
system("docker info", intern = TRUE, ignore.stderr = TRUE)
}
install_docker <- function() {
# Open the Docker Desktop download page in the browser for installation
browseURL("https://www.docker.com/products/docker-desktop")
}
# Main logic
if (inherits(try(check_docker_running(), silent = TRUE), "try-error")) {
liftr::install_docker()
} else {
message("Docker is running.")
}Warning in system(“docker info”, intern = TRUE, ignore.stderr = TRUE): el comando ejecutado ‘docker info’ tiene el estatus 1
Code
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#PACKAGES#######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
unlink("*_cache", recursive=T)
# ----------------------------------------------------------------------
# 2. Use a single pak::pkg_install() call for most CRAN packages
# ----------------------------------------------------------------------
paks <-
c(#"git",
# To connect to github
"gh", #interface for GitHub API from R
#
"gitcreds", # manages Git credentials (usernames, passwords, tokens)
#
"usethis", # simplifies common project setup tasks for R developers
# Package to bring packages in development
"devtools",
# Package administration
"renv",
# To manipulate data
"knitr", "pander", "DT",
# Join
"fuzzyjoin", "RecordLinkage",
# For tables
"tidyverse", "janitor",
# For contingency tables
"kableExtra",
# For connections with python
"reticulate",
# To manipulate big data
"polars", "sqldf",
# To bring big databases
"nanoparquet",
# Interface for R and RStudio in R
"installr", "rmarkdown", "quarto", "yaml", #"rstudioapi",
# Time handling
"clock",
# Combine plots
"ggpubr",
# Parallelized iterative processing
"furrr",
# Work like a tibble with a data.table database
"tidytable",
# Split database into training and testing
"caret",
# Impute missing data
"missRanger", "mice",
# To modularize tasks
"job",
# For PhantomJS install checks
"webshot"
)
# dplyr
# janitor
# reshape2
# tidytable
# arrow
# boot
# broom
# car
# caret
# data.table
# DiagrammeR
# DiagrammeRsvg
# dplyr
# epiR
# epitools
# ggplot2
# glue
# htmlwidgets
# knitr
# lubridate
# naniar
# parallel
# polycor
# pROC
# psych
# readr
# rio
# rsvg
# scales
# stringr
# tableone
# rmarkdown
# biostat3
# codebook
# finalfit
# Hmisc
# kableExtra
# knitr
# devtools
# tidyr
# stringi
# stringr
# muhaz
# sqldf
# compareGroups
# survminer
# lubridate
# ggfortify
# car
# fuzzyjoin
# compareGroups
# caret
# job
# htmltools
# nanoparquet
# ggpubr
# polars
# installr
# clock
# pander
# reshape
# mice
# missRanger
# VIM
# withr
# biostat3
# broom
# glue
# finalfit
# purrr
# sf
# pak::pkg_install(paks)
pak::pak_sitrep()
# pak::sysreqs_check_installed(unique(unlist(paks)))
#pak::lockfile_create(unique(unlist(paks)), "dependencies_duplicates24.lock", dependencies=T)
#pak::lockfile_install("dependencies_duplicates24.lock")
#https://rdrr.io/cran/pak/man/faq.html
#pak::cache_delete()
library(tidytable)Code
library(polars)Warning: package ‘polars’ was built under R version 4.4.3
Code
library(ggplot2)
library(readr)
# ----------------------------------------------------------------------
# 3. Activate polars code completion (safe to try even if it fails)
# ----------------------------------------------------------------------
try(polars_code_completion_activate())Code
# ----------------------------------------------------------------------
# 4. BPMN from GitHub (not on CRAN, so install via devtools if missing)
# ----------------------------------------------------------------------
if (!requireNamespace("bpmn", quietly = TRUE)) {
devtools::install_github("bergant/bpmn")
}
# ----------------------------------------------------------------------
# 5. PhantomJS Check (use webshot if PhantomJS is missing)
# ----------------------------------------------------------------------
# if (!webshot::is_phantomjs_installed()) {
# webshot::install_phantomjs()
# }
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#FUNCTIONS######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
copiar_nombres <- function(x,row.names=FALSE,col.names=TRUE,dec=",",...) {
if(class(try(dplyr::ungroup(x)))[1]=="tbl_df"){
if(options()$OutDec=="."){
options(OutDec = dec)
write.table(format(data.frame(x)),"clipboard",sep="\t",row.names=FALSE,col.names=col.names,...)
options(OutDec = ".")
return(x)
} else {
options(OutDec = ",")
write.table(format(data.frame(x)),"clipboard",sep="\t",row.names=FALSE,col.names=col.names,...)
options(OutDec = ",")
return(x)
}
} else {
if(options()$OutDec=="."){
options(OutDec = dec)
write.table(format(x),"clipboard",sep="\t",row.names=FALSE,col.names=col.names,...)
options(OutDec = ".")
return(x)
} else {
options(OutDec = ",")
write.table(format(x),"clipboard",sep="\t",row.names=FALSE,col.names=col.names,...)
options(OutDec = ",")
return(x)
}
}
}
#WINDOWS do not restrict memory size
if(.Platform$OS.type == "windows") withAutoprint({
memory.size()
memory.size(TRUE)
memory.limit()
})Warning: ‘memory.size()’ is no longer supported
Warning: ‘memory.size()’ is no longer supported
Warning: ‘memory.limit()’ is no longer supported
Code
memory.limit(size=56000)Warning: ‘memory.limit()’ is no longer supported
Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#NAs are replaced with "" in knitr kable
options(knitr.kable.NA = '')
pander::panderOptions('big.mark', ',')
pander::panderOptions('decimal.mark', '.')
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#
#to format rows in bold
format_cells <- function(df, rows ,cols, value = c("italics", "bold", "strikethrough")){
# select the correct markup
# one * for italics, two ** for bold
map <- setNames(c("*", "**", "~~"), c("italics", "bold", "strikethrough"))
markup <- map[value]
for (r in rows){
for(c in cols){
# Make sure values are not factors
df[[c]] <- as.character( df[[c]])
# Update formatting
df[r, c] <- ifelse(nchar(df[r, c])==0,"",paste0(markup, gsub(" ", "", df[r, c]), markup))
}
}
return(df)
}
#To produce line breaks in messages and warnings
knitr::knit_hooks$set(
error = function(x, options) {
paste('\n\n<div class="alert alert-danger" style="font-size: small !important;">',
gsub('##', '\n', gsub('^##\ Error', '**Error**', x)),
'</div>', sep = '\n')
},
warning = function(x, options) {
paste('\n\n<div class="alert alert-warning" style="font-size: small !important;">',
gsub('##', '\n', gsub('^##\ Warning:', '**Warning**', x)),
'</div>', sep = '\n')
},
message = function(x, options) {
paste('<div class="message" style="font-size: small !important;">',
gsub('##', '\n', x),
'</div>', sep = '\n')
}
)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
sum_dates <- function(x){
cbind.data.frame(
min= as.Date(min(unclass(as.Date(x)), na.rm=T), origin = "1970-01-01"),
p001= as.Date(quantile(unclass(as.Date(x)), .001, na.rm=T), origin = "1970-01-01"),
p005= as.Date(quantile(unclass(as.Date(x)), .005, na.rm=T), origin = "1970-01-01"),
p025= as.Date(quantile(unclass(as.Date(x)), .025, na.rm=T), origin = "1970-01-01"),
p25= as.Date(quantile(unclass(as.Date(x)), .25, na.rm=T), origin = "1970-01-01"),
p50= as.Date(quantile(unclass(as.Date(x)), .5, na.rm=T), origin = "1970-01-01"),
p75= as.Date(quantile(unclass(as.Date(x)), .75, na.rm=T), origin = "1970-01-01"),
p975= as.Date(quantile(unclass(as.Date(x)), .975, na.rm=T), origin = "1970-01-01"),
p995= as.Date(quantile(unclass(as.Date(x)), .995, na.rm=T), origin = "1970-01-01"),
p999= as.Date(quantile(unclass(as.Date(x)), .999, na.rm=T), origin = "1970-01-01"),
max= as.Date(max(unclass(as.Date(x)), na.rm=T), origin = "1970-01-01")
)
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Define the function adapted for Polars
sum_dates_polars <- function(df, date_col) {
# Create the list of quantiles
quantiles <- c(0.001, 0.005, 0.025, 0.25, 0.5, 0.75, 0.975, 0.995, 0.999)
# Create expressions to calculate min and max
expr_list <- list(
pl$col(date_col)$min()$alias("min"),
pl$col(date_col)$max()$alias("max")
)
# Add expressions for quantiles
for (q in quantiles) {
expr_list <- append(expr_list, pl$col(date_col)$quantile(q)$alias(paste0("p", sub("\\.", "", as.character(q)))))
}
# Apply the expressions and return a DataFrame with the results
df$select(expr_list)
}
# Custom function for sampling with a seed
sample_n_with_seed <- function(data, size, seed) {
set.seed(seed)
dplyr::sample_n(data, size)
}
# Function to get the most frequent value
most_frequent <- function(x) {
uniq_vals <- unique(x)
freq_vals <- sapply(uniq_vals, function(val) sum(x == val))
most_freq <- uniq_vals[which(freq_vals == max(freq_vals))]
if (length(most_freq) == 1) {
return(most_freq)
} else {
return(NA)
}
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#CONFIG #######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
options(scipen=2) #display numbers rather scientific number
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Define the function first
#oins these values with semicolons and optionally truncates the result if it exceeds a specified width.
toString2 <- function(x, width = NULL, ...) {
string <- paste(x, collapse = "; ")
if (missing(width) || is.null(width) || width == 0)
return(string)
if (width < 0)
stop("'width' must be positive")
if (nchar(string, type = "w") > width) {
width <- max(6, width)
string <- paste0(substr(string, 1, width - 3), "...")
}
string
}Error in contrib.url(repos, "source") :
trying to use CRAN without setting a mirror
* pak version:
- 0.8.0.1
* Version information:
- pak platform: x86_64-w64-mingw32 (current: x86_64-w64-mingw32, compatible)
- pak repository: - (local install?)
* Optional packages installed:
- pillar
* Library path:
- G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32
- C:/Program Files/R/R-4.4.1/library
* pak is installed at G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32/pak.
* Dependency versions:
- callr 3.7.6
- cli 3.6.2
- curl 5.2.1
- desc 1.4.3
- filelock 1.0.3
- jsonlite 1.8.8
- lpSolve 5.6.23.9000
- pkgbuild 1.4.4
- pkgcache 2.2.2.9000
- pkgdepends 0.7.2.9000
- pkgsearch 3.1.3.9000
- processx 3.8.4
- ps 1.7.6
- R6 2.5.1
- zip 2.3.1
* Dependencies can be loaded
> memory.size()
[1] Inf
> memory.size(TRUE)
[1] Inf
> memory.limit()
[1] Inf
[1] Inf
To assess the main goals of the study, we first focused on distinguishing each user across the yearly datasets obtained from SENDA (1). Next, we separated each user’s treatments (2). Finally, we normalized, standardized, and cleaned each treatment (3). Although these stages may appear conceptually separate and sequential, they are interdependent (e.g., some variables needed to be standardized to identify duplicate entries).
Throughout this document, we use the terms “rows”, “cases”, “observations” or “treatment episodes” interchangeably to refer to entries in the dataset.
The previous document revealed overlapping cases and nearly identical records, as well as patients with unfinished treatments (i.e., missing discharge dates in the 2018-2019 databases) who might have received subsequent overlapping treatments.
pre-0. Missing discharge dates due to truncation in dataset retrieval
We observe that there are some cases with missing treatment discharge dates, as if they were still ongoing, because the responsible institution provided us with a database only up to the date of submission of the previous project (namely, November 2019). Therefore, the conclusion of those treatments (whether dropout, administrative discharge, or therapeutic discharge) cannot be determined. We also restricted these observations if they had a treatment completion status of “currently in”, as there were a few observations with days in admission that generated discharge dates previous to 2019 with a finished record.
Code
SISTRAT23_c1_2010_2022_df_prev1g|>
filter(is.na(disch_date_num)) |>
mutate(disch_date_na= as.Date(adm_date_rec_num+ dias_en_tratamiento, origin = "1970-01-01")) |>
select(TABLE_rec, rn, hash_key, dias_en_tratamiento, adm_age_rec, adm_date_rec, disch_date_na, id_centro, tr_compliance, plan_type, senda)|>
filter(disch_date_na<"2023-04-28" & grepl("currently",tr_compliance))|>
(\(df) {
cat(paste0("00. Missing discharge dates due to truncation, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("00. Missing discharge dates due to truncation, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
distinct(df, hash_key)|> pull(hash_key) ->> hash_truncated_treatments_due_to_retrieval_2019
df|> pull(rn) ->> rows_truncated_treatments_due_to_retrieval_2019
df
})()|>
#View()
filter(hash_key %in% (sample_n_with_seed(data.frame(hash_truncated_treatments_due_to_retrieval_2019),20, seed=2125)|> pull(1)))|>
mutate(hash_key= as.numeric(factor(hash_key)))|>
knitr::kable("markdown", caption= "Missing discharge dates due to administrative truncation (sample)")00. Missing discharge dates due to truncation, cases: 876
00. Missing discharge dates due to truncation, RUNs: 875
| TABLE_rec | rn | hash_key | dias_en_tratamiento | adm_age_rec | adm_date_rec | disch_date_na | id_centro | tr_compliance | plan_type | senda |
|---|---|---|---|---|---|---|---|---|---|---|
| 20191 | 165217 | 1 | 205 | 33.17 | 2019-04-22 | 2019-11-13 | 415 | currently in | pg-pab | si |
| 20191 | 166554 | 2 | 197 | 27.61 | 2019-04-30 | 2019-11-13 | 109 | currently in | pg-pai | si |
| 20151 | 73018 | 3 | 1734 | 38.89 | 2015-02-05 | 2019-11-05 | 139 | currently in | pg-pab | si |
| 20191 | 168790 | 4 | 114 | 45.00 | 2019-07-22 | 2019-11-13 | 497 | currently in | pg-pai | si |
| 20191 | 170585 | 5 | 104 | 51.37 | 2019-08-01 | 2019-11-13 | 667 | currently in | pg-pai | si |
| 20191 | 169192 | 6 | 141 | 29.24 | 2019-06-25 | 2019-11-13 | 489 | currently in | pg-pab | si |
| 20191 | 155751 | 7 | 723 | 53.68 | 2017-11-20 | 2019-11-13 | 628 | currently in | pg-pab | si |
| 20191 | 168082 | 8 | 162 | 43.62 | 2019-06-04 | 2019-11-13 | 432 | currently in | m-pr | si |
| 20191 | 155164 | 9 | 1336 | 35.78 | 2016-03-17 | 2019-11-13 | 146 | currently in | pg-pai | si |
| 20191 | 156096 | 10 | 627 | 31.05 | 2018-02-24 | 2019-11-13 | 625 | currently in | pg-pab | si |
| 2011 | 20203 | 11 | 2943 | 38.85 | 2011-10-15 | 2019-11-05 | 269 | currently in | pg-pr | no |
| 20161 | 88146 | 12 | 1610 | 26.49 | 2015-06-09 | 2019-11-05 | 556 | currently in | pg-pai | si |
| 20181 | 126990 | 13 | 979 | 20.24 | 2017-03-01 | 2019-11-05 | 307 | currently in | pg-pr | si |
| 20191 | 168526 | 14 | 127 | 18.37 | 2019-07-09 | 2019-11-13 | 614 | currently in | pg-pab | si |
| 20181 | 130398 | 15 | 762 | 26.70 | 2017-10-04 | 2019-11-05 | 239 | currently in | pg-pai | si |
| 2010 | 5236 | 16 | 3906 | 31.65 | 2009-02-24 | 2019-11-05 | 280 | currently in | pg-pab | si |
| 20161 | 88300 | 17 | 1572 | 16.25 | 2015-07-17 | 2019-11-05 | 291 | currently in | pg-pr | no |
| 20191 | 170097 | 18 | 91 | 49.82 | 2019-08-14 | 2019-11-13 | 612 | currently in | pg-pab | si |
| 20191 | 169808 | 19 | 99 | 58.67 | 2019-08-06 | 2019-11-13 | 489 | currently in | pg-pab | si |
| 20191 | 160795 | 20 | 330 | 18.01 | 2018-12-18 | 2019-11-13 | 138 | currently in | pg-pai | si |
However, it is important to note that if those treatments had continued, they would appear in the following year’s database. We successfully imported an updated 2019 database and attempted to standardize it according to the formatting applied before the initial step of the deduplication phase, as documented on June 2, 2025. For the remaining cases, and while we attempt to obtain additional complementary databases, a discharge date of December 31, 2019, was imputed.
After the imputation, we corrected cases by creating a join key (concat) by combining each patient’s hash_key with their admission date (adm_date_rec), then merging the main dataset (SISTRAT23_c1_2010_2022_df_prev1h) with the updated discharge information provided by SENDA professionals by that key (parsing the joined discharge date into a proper Date object). After selecting the relevant variables, we grouped the data by hash_key and, for any group with exactly one record whose original disch_date_rec0 equals December 31, 2019, assigns the actual parsed disch date to a new column (new_disch); all other rows receive NA. Finally, it ungroups and filters to keep only those rows where new_disch is not missing. This criteria responded to the request that was based in the HASH key only (not in the combination with admission date).
Code
X2019_2019dup_encrip <- readr::read_delim("G:/My Drive/Alvacast/SISTRAT 2023/data/20250508_original_data/2019_2019dup_encrip.csv",
delim = ";", escape_double = FALSE, trim_ws = TRUE,
locale = locale(decimal_mark = ",", grouping_mark = ".", tz = "America/Santiago",
encoding = "latin1"),#encoding = "ISO-8859-1"),
na = c("", "NA","null"),
guess_max = min(1e5, Inf)) |> janitor::clean_names()|>
mutate(cod_indentificacion= tolower(codigo_identificaci_a_a3n))Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Format as most as possible the updated 2019 database in terms of dates")
df2019_mod_maureen <-
X2019_2019dup_encrip|>
filter(hashkey %in% hash_truncated_treatments_due_to_retrieval_2019
)|>
#select(TABLE, hash_key, adm_date_rec, disch_date)
select(hashkey, diasen_tratamiento, fecha_ingresoa_tratamiento, fecha_egresode_tratamiento, motivode_egreso) |>
mutate(discharge_date= stringr::str_replace_all(fecha_egresode_tratamiento,"/","-"))%>%
mutate(discharge_date= readr::parse_date(discharge_date, format="%d-%m-%Y")) |>
mutate(adm_date_orig= stringr::str_replace_all(fecha_ingresoa_tratamiento,"/","-"))%>%
mutate(adm_date_orig= readr::parse_date(adm_date_orig, format="%d-%m-%Y"))|>
mutate(discharge_date= as.character(discharge_date))|>
tidytable::mutate(discharge_date = tidytable::case_when(
#rn== 1294
hashkey=="c4795829b6ea9cfc50b988c85deb391fa041d99a0ebca6b68a1378f37e3eb420" & adm_date_orig=="2009-06-30" ~ "2019-12-30",
#rn== 1934
hashkey=="23874d59570adaac6690c85481b869570c10c2f8931fc20636037cdff04af067" & adm_date_orig=="2008-07-02" ~ "2009-05-13",
#rn== 1938
hashkey=="5a16413f76625a09585c89fd3ea4fb05d1ea5cbfbc18247a9fb6e7e21534562d" & adm_date_orig=="2008-07-23" ~ "2009-04-14",
#rn== 2602
hashkey=="11b143acdce4bf1d3a72acd4a703ea8c38543fd02585b4f3b0433e227929ed3c" & adm_date_orig=="2008-03-04" ~ "2009-09-15",
#rn== 2603
hashkey=="986ded00e6ca834805a169ed528655e22f819bf5104d1729b2e1453f20f38065" & adm_date_orig=="2008-12-05" ~ "2009-06-02",
#rn== 2604
hashkey=="d402a1e13f25b2411ca346b0dc84b9fffa45887e628abf09262777b6deae85aa" & adm_date_orig=="2009-06-09" ~ "2009-06-09",
#rn== 2896
hashkey=="0d248b372c7224ae2cc1cabb750d6201150175b5d65ec0397ff2127d32b6b675" & adm_date_orig=="2009-02-05" ~ "2009-03-09",
#rn== 3198
hashkey== "6eb67e1ead556eb1dbd21951747440057a17a872b33b468a37c9bf781219cef8" & adm_date_orig=="2009-10-07" ~ "2010-04-10",
#rn== 3260
hashkey=="e0acff1477306ee93abfca7e251cc6d23db916b390a9fe506fbbefc371ce1d43" & adm_date_orig=="2009-12-07" ~ "2010-06-01",
#rn== 5175
hashkey=="eb13b44585501a35df9ce6d262ca6e69e4aa34063af219e19cc95e7609e38cdf" & adm_date_orig=="2010-04-26" ~ "2011-05-03",
#rn== 5760
hashkey=="058e8b2c02f98d488a78d78d80435e516c6628cd7edb87ecaf9f8c981d9614ba" & adm_date_orig=="2010-05-03" ~ "2010-10-04", #rn== 6354
hashkey=="4d42363412d6a435dd2762bbee7f9b4fe4117ff4c94d55e10472342156238ccb" & adm_date_orig=="2010-06-17" ~ "2010-07-01",
#rn== 5760
hashkey=="058e8b2c02f98d488a78d78d80435e516c6628cd7edb87ecaf9f8c981d9614ba" & adm_date_orig=="2010-05-03" ~ "2010-10-04",
#rn== 8176
hashkey=="228fc5b7b88c5f544f71f9ecfbad4d1750470b717f869a7aa9f01b0169a5d890" & adm_date_orig=="2010-07-01" ~ "2011-01-13",
#rn== 8756
hashkey=="7ebe4155bb7741beef0f30ce47ecbc735bd1f7137d22e81ba21d5f12f8398fa2" & adm_date_orig=="2010-10-04" ~ "2011-01-31",
#rn== 5760
hashkey=="058e8b2c02f98d488a78d78d80435e516c6628cd7edb87ecaf9f8c981d9614ba" & adm_date_orig=="2010-05-03" ~ "2010-10-04",
#rn== 9092
hashkey=="93478aa27b121dbad91cb8e36ef60caa42fce6ca5b99478a77e9b8478df600f3" & adm_date_orig=="2010-11-23" ~ "2011-01-14",
#rn== 9171
hashkey=="6500209f17b52ab4e00a140f7c8f0a10d9b073f81ac9443203f0a1b84c4dc1e8" & adm_date_orig=="2010-11-25" ~ "2011-06-10",
#rn== 9177
hashkey=="4d6e97bfc2aeb15a8c6457ad1c84335de48b5456177b9749159ec2974537634f" & adm_date_orig=="2010-11-25" ~ "2011-06-20",
#rn== 9444
hashkey=="1d5a63a966cea8241228f0057a38ef4e63e0fb353dda174dc95d4393e4cdcefa" & adm_date_orig=="2010-12-02" ~ "2011-06-10",
#rn== 10424
hashkey=="eb13b44585501a35df9ce6d262ca6e69e4aa34063af219e19cc95e7609e38cdf" & adm_date_orig=="2010-04-26" ~ "2011-05-03",
#rn== 11482
hashkey=="228fc5b7b88c5f544f71f9ecfbad4d1750470b717f869a7aa9f01b0169a5d890" & adm_date_orig=="2010-07-01" ~ "2011-01-13",
#rn== 12097
hashkey=="6500209f17b52ab4e00a140f7c8f0a10d9b073f81ac9443203f0a1b84c4dc1e8" & adm_date_orig=="2010-11-25" ~ "2011-06-10", #rn== 12102
hashkey=="4d6e97bfc2aeb15a8c6457ad1c84335de48b5456177b9749159ec2974537634f" & adm_date_orig=="2010-11-25" ~ "2011-06-20", #rn== 12301
hashkey=="1d5a63a966cea8241228f0057a38ef4e63e0fb353dda174dc95d4393e4cdcefa" & adm_date_orig=="2010-12-02" ~ "2011-06-10", #rn== 13086
hashkey=="c75bb8c43963dbad7a1b311497073a58b0e97bb82c5c63a4bc7ae4d1c9014592" & adm_date_orig=="2011-01-13" ~ "2011-07-10",
#rn== 13644
hashkey=="f40999d751e9eb84f5ed6d832d96a1de872599c181e28dd420507c58d7464ccf" & adm_date_orig=="2011-02-08" ~ "2011-08-04",
#rn== 14099
hashkey=="dbe7ddec7591332da15c3c4a1d4a2a1559d455a67b6c31a390ea546ea259c045" & adm_date_orig=="2011-02-10" ~ "2011-05-03",
#rn== 14339
hashkey=="05ff2bf96ef3a294c09b39cf91c19f7a74b080487f13f62c449812f14cefff37" & adm_date_orig=="2011-03-22" ~ "2011-07-31",
#rn== 15403
hashkey=="bdf81829448433489a21d8ac17de96f3765707798d8e2beb7653414f43f272aa" & adm_date_orig=="2011-04-15" ~ "2011-06-12",
#rn== 16016
hashkey=="0bd45263c5217ae4324c23ca4bfec945d4100276fcac4e3e66ad5b6f5341d3fd" & adm_date_orig=="2011-05-20" ~ "2011-06-01",
#rn== 16150
hashkey=="d6d0aaa21c50981871615a6b8886d1f69a3d0f125165f63f6a1c54729be5eea2" & adm_date_orig=="2011-05-23" ~ "2011-06-05",
#rn== 16413
hashkey=="4728851a593a1490d73682e45945fe0f253d0f18dfc12aa1d2d21deef206c39c" & adm_date_orig=="2011-04-18" ~ "2011-08-30",
#rn== 16742
hashkey=="caafb47faaab3c9637821a50ce4dcef33b8e3a9fc275f0ef76f0c93681eb15ba" & adm_date_orig=="2011-06-06" ~ "2011-07-04",
#rn== 16745
hashkey=="18096679bef8db59dbd0ca3be91fa36d7d9dcbbf06b85be2662f410d0146d1a2" & adm_date_orig=="2011-06-17" ~ "2011-07-31",
#rn== 16755
hashkey=="40d3ff594c6c3ddd96e37e5e53fbd22030916a99a4f04cf6283ad188058f2a5b" & adm_date_orig=="2011-06-23" ~ "2011-07-07",
#rn== 17500
hashkey=="667766680894eb203756044682c8445365bb0a831012ec49341b080390133d5d" & adm_date_orig=="2011-06-20" ~ "2011-08-02",
#rn== 30449
hashkey=="60e3066c438a10246353d3a3bce07a58fbfda39465aa84debd48cede21319a94" & adm_date_orig=="2012-10-16" ~ "2013-08-13",
#rn== 34193
hashkey=="60e3066c438a10246353d3a3bce07a58fbfda39465aa84debd48cede21319a94" & adm_date_orig=="2012-10-16" ~ "2013-08-13",
#rn== 35638
hashkey=="08a5dc9a016c0525d7ceea954a8078391701ea9743b71bc2a012f0949952029f" & adm_date_orig=="2013-01-07" ~ "2013-07-17",
#rn== 36161
hashkey=="71049ebb5d958e0647c01c4398c91ff3e02275f7dc5e2fefee5bc263a7653c96" & adm_date_orig=="2013-01-28" ~ "2013-08-12",
#rn== 36415
hashkey=="52e218f6406835e8624ffe71595152560ec44a02a7580d673019eefa88df7a61" & adm_date_orig=="2013-01-29" ~ "2013-04-02",
#rn== 37116
hashkey=="22c282462adfb8e48b3a6b697d533244c9c656a6b31ff87d0180679d9f5ce98d" & adm_date_orig=="2013-02-08" ~ "2013-08-02",
#rn== 37958
hashkey=="221d71ae6c4dba4aee931b3ee518d47fd3972fed3fbf7f4d44c676bedca786c4" & adm_date_orig=="2013-03-18" ~ "2013-07-10",
#rn== 38907
hashkey=="877ea9b68dde038d9f63d04d4e65d1eb27ac3f46af22e310c7c2114feb7f871b" & adm_date_orig=="2013-04-18" ~ "2013-07-31",
#rn== 38908
hashkey=="14af0ddf318fb49877b16491b0fb7df491d98bd32dd854bdbec526f898dd9946" & adm_date_orig=="2013-04-18" ~ "2013-06-17",
#rn== 38909
hashkey=="243a1044f746ae87432532552b4b93b6978fb3b18fa3a4305a11b2af698eb013" & adm_date_orig=="2013-04-16" ~ "2013-07-27",
#rn== 39617
hashkey=="0e729e637c95d5d4486a7f822d14f0f1925ac358fff61d9bba9d7407b8e9abe7" & adm_date_orig=="2013-04-29" ~ "2013-07-25",
#rn== 39618
hashkey=="289a7b6c884980dc60c9171bb05939bacf18a62551ebda723af75cbfc8308db9" & adm_date_orig=="2013-05-08" ~ "2013-07-14",
#rn== 39620
hashkey=="cde086d548022a94e623bfc3d6b34202b28141ed2134ba35425ce4807e75f2fb" & adm_date_orig=="2013-04-29" ~ "2013-07-02",
#rn== 40045
hashkey=="10fc40384411161967b222bf530a0378e0ae585bd69370d57d9c4fb49a1a34c3" & adm_date_orig=="2013-05-22" ~ "2013-08-02",
#rn== 40293
hashkey=="67353760ae53ad8963176af0ec6cab9c4bdad13b9e53058e68e53f80b409b224" & adm_date_orig=="2013-05-29" ~ "2013-08-07",
#rn== 40599
hashkey=="3ce639d4d0330242d1f7c1e6496e834ad3fa2b41bef89b09bc373e9dede8c981" & adm_date_orig=="2013-05-02" ~ "2013-07-03",
#rn== 41114
hashkey=="5e6d9dcec9e717d4536f7cfa5cc0f713e7c2c7933058aeb9a37fec0a24da5151" & adm_date_orig=="2013-06-06" ~ "2013-07-31",
#rn== 41117
hashkey=="e01e3218ba73e9d26178e7a6aceb86357695bc88117f1d7b89c8adbf55210528" & adm_date_orig=="2013-06-05" ~ "2013-06-27",
#rn== 42456
hashkey=="421abbc2c85687aa87adec1c3146debf5ddea3ea71f65d708c2cf4d4dde86e38" & adm_date_orig=="2013-07-02" ~ "2013-07-08",
#rn== 42633
hashkey=="567f1fd735550a9bc1a2ea8a838d87b69369caa106c2d0cd0a1b38581d09919f" & adm_date_orig=="2013-07-09" ~ "2013-08-16",
#rn== 42634
hashkey=="7f259b5289b209cc669db813abfcd14519a21c4f69aaeb0190f094c61a52afad" & adm_date_orig=="2013-06-28" ~ "2013-07-09",
#rn== 42854
hashkey=="49cca05a51baac5c836a053eac96674c775e2d7164209a04f09f8da34952b789" & adm_date_orig=="2013-07-02" ~ "2013-08-02",
#rn== 43076
hashkey=="6adbbaff91e32138777abcf66a161d953722255c88368f9a5877d1ddfa48decd" & adm_date_orig=="2013-08-06" ~ "2013-08-20",
#rn== 43181
hashkey=="02c866ee44e5a3a310cf18728753e3a4c3751d4ea4d61edc22d78606cde0fcc8" & adm_date_orig=="2013-08-01" ~ "2013-08-16",
#rn== 43182
hashkey=="506be60207917af56fa39175f11ee5b3b874c0883245e37d0b2a79e0b24f08ad" & adm_date_orig=="2013-08-01" ~ "2013-08-22",
TRUE ~ as.character(discharge_date)
))|>
tidytable::mutate(discharge_date= readr::parse_date(discharge_date, format="%Y-%m-%d"), motivode_egreso= tolower(motivode_egreso))|>
#Early vs. late dropout
tidytable::mutate(dit_earl_drop= ifelse(diasen_tratamiento>=90 & !is.na(diasen_tratamiento),0,1))|>
#changed the order of the labels
tidytable::mutate(dit_earl_drop= factor(dit_earl_drop, labels=c(">= 90 days","<90 days")))|> #t.test(dit_rec~ dit_earl_drop, data= df)
tidytable::mutate(
tr_compliance = case_when(
grepl("<", dit_earl_drop) & grepl("abando",motivode_egreso) ~ "early dropout",
grepl(">", dit_earl_drop) & grepl("abando",motivode_egreso) ~ "late dropout",
grepl("<", dit_earl_drop) & grepl("adm", motivode_egreso) ~ "early adm discharge",
grepl(">", dit_earl_drop) & grepl("adm", motivode_egreso) ~ "late adm discharge",
grepl("alta ter", motivode_egreso) ~ "completion",
motivode_egreso == "muerte" ~ "death",
grepl("derivac", motivode_egreso) ~ "referral",
is.na(motivode_egreso) ~ "currently in",
TRUE ~ "other"
)
) #|> janitor::tabyl(tr_compliance)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
SISTRAT23_c1_2010_2022_df_prev1g|>
filter(is.na(disch_date_num)) |>
mutate(disch_date_na= as.Date(adm_date_rec_num+ dias_en_tratamiento, origin = "1970-01-01")) |>
(\(df){
cat("Table of dates of discharge with days in treatment\n")
print(table(df$disch_date_na))
cat("We should discard dates previous to 2023-04-28 because they were part of the actuala dministrative truncation process\n\n")
df
})() |>
select(TABLE_rec, rn, hash_key, dias_en_tratamiento, adm_age_rec, adm_date_rec, disch_date, disch_date_na, id_centro, tr_compliance, plan_type, senda) |>
#filter(disch_date_na<"2023-04-28" & grepl("currently",tr_compliance)) |>
filter(disch_date_na<"2023-04-28")|>
#Take only
inner_join(df2019_mod_maureen, by= c("hash_key"="hashkey", "adm_date_rec"="adm_date_orig")) |>
select(-fecha_ingresoa_tratamiento, -fecha_egresode_tratamiento)|>
(\(df) {
cat(paste0("New discharge date from updated C1 2019 database (discarding discharges in 2024-04-28; COINCIDENCE by HASH & admission date), cases: ", formatC(nrow(filter(df, !is.na(discharge_date))), big.mark=",")),"\n")
cat(paste0("New discharge date from updated C1 2019 database (discarding discharges in 2024-04-28; COINCIDENCE by HASH & admission date), RUNs: ", formatC(nrow(distinct(filter(df, !is.na(discharge_date)), hash_key)), big.mark=","),"\n\n"))
#export the records with discharge dates that didnt have earlier
base::subset(df,
subset = !is.na(discharge_date),
select = c("rn","hash_key", "adm_date_rec", "discharge_date", "tr_compliance.y")) ->> hashs_dates_updated_disch_date
cat("Lets check yearly database origin. Where do they come from?...\n")
print(janitor::tabyl(df, TABLE_rec))
})()
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("Corrected the database with updated discharge dates")
colnames(hashs_dates_updated_disch_date) <- c("rny", "hash_key", "date_adm", "date_disch", "tr_comp")
SISTRAT23_c1_2010_2022_df_prev1h<-
SISTRAT23_c1_2010_2022_df_prev1g|>
(\(df) {
cat(paste0("4.pre. Database before correcting discharge dates, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4.pre. Database before discarding discharge dates, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})()|>
#left_join(hashs_dates_updated_disch_date, by=c("hash_key"="hash_key", "adm_date_rec"="date_adm"))|>
left_join(hashs_dates_updated_disch_date[,c("rny", "date_disch", "tr_comp")], by=c("rn"="rny"))|>
(\(df) {
if (nrow(df)> nrow(SISTRAT23_c1_2010_2022_df_prev1g))stop("Error: Added treatment episodes in the process")
df
})()|>
mutate(date_disch_num= unclass(date_disch))|>
#If it has an updated discharge date, replace with this; if not and applies, add Dec 31th, 2019. If not, preserve date.
mutate(disch_date_rec0_num= case_when(rn %in% hashs_dates_updated_disch_date$rny~ date_disch_num, rn %in% setdiff(rows_truncated_treatments_due_to_retrieval_2019, hashs_dates_updated_disch_date$rny)~ 18261, T~ disch_date_num))|>
mutate(disch_date_rec0= case_when(rn %in% hashs_dates_updated_disch_date$rny~ date_disch, rn %in% setdiff(rows_truncated_treatments_due_to_retrieval_2019, hashs_dates_updated_disch_date$rny)~ as.Date("2019-12-31"), T~disch_date))|>
mutate(dit_rec1= case_when(rn %in% hashs_dates_updated_disch_date$rny~ (date_disch_num- adm_date_rec_num), rn %in% setdiff(rows_truncated_treatments_due_to_retrieval_2019, hashs_dates_updated_disch_date$rny)~ (disch_date_rec0_num- adm_date_rec_num), T~ dit_rec))|>
#added the updated status; if not available, we imputed the "adm truncated"
#the rest keeps their tr. compliance status
mutate(tr_compliance_rec= case_when(rn %in% hashs_dates_updated_disch_date$rny~ tr_comp, rn %in% setdiff(rows_truncated_treatments_due_to_retrieval_2019, hashs_dates_updated_disch_date$rny)~ "adm truncated", T~tr_compliance))|>
mutate(OBS= case_when(rn %in% hashs_dates_updated_disch_date$rny~ paste0(OBS, "; 4.pre. Missing discharge dates due administrative truncation in 2019, updated"), T~OBS))|>
mutate(OBS= case_when(rn %in% setdiff(rows_truncated_treatments_due_to_retrieval_2019, hashs_dates_updated_disch_date$rny)~ paste0(OBS, "; 4.pre. Missing discharge dates due administrative truncation in 2019, imputed"), T~OBS))|>
(\(df) {
cat(paste0("4.pre. Database after correcting discharge dates, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4.pre. Database after discarding discharge dates, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
if (nrow(df)> nrow(SISTRAT23_c1_2010_2022_df_prev1g))stop("Error: Added treatment episodes in the process")
df
})() |>
select(-date_disch)#, -tr_comp)
table(SISTRAT23_c1_2010_2022_df_prev1h$tr_comp)|>
data.frame()|>
left_join(data.frame(prop.table(table(SISTRAT23_c1_2010_2022_df_prev1h$tr_comp))), by="Var1")|>
rename("n"="Freq.x", "%"="Freq.y")|>
mutate(`%`=scales::percent(`%`))|>
left_join(psych::describeBy(SISTRAT23_c1_2010_2022_df_prev1h$dit_rec1, SISTRAT23_c1_2010_2022_df_prev1h$tr_comp, mat=T, quant = c(0.25, 0.75), digits=2)[,c("group1", "mean", "median", "Q0.25", "Q0.75")], by= c("Var1"="group1"))|>
knitr::kable("markdown", caption= "Tr compliance status of the updated records")
#remove original variable. Now we have the merged (tr_compliance_rec)
SISTRAT23_c1_2010_2022_df_prev1h$tr_comp <- NULL
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("2025-06-05: Maureen saved some cases\n")
maureen_casos_disch_miss <- readr::read_tsv("https://docs.google.com/spreadsheets/u/2/d/1X0jBuHooVx5RnV9p3fu0tqOlb9JVSyT0ZRSrhxM4kEI/export?format=tsv&id=1X0jBuHooVx5RnV9p3fu0tqOlb9JVSyT0ZRSrhxM4kEI&gid=0")Code
obtained_from_senda_professional_jun_2025<-
SISTRAT23_c1_2010_2022_df_prev1h |>
mutate(concat= paste0(hash_key,"_",adm_date_rec)) |>
inner_join(mutate(maureen_casos_disch_miss, disch= readr::parse_date(`rescate FE`, format="%m/%d/%Y")), by="concat") |> tidytable::select(rn,hash_key, tr_compliance_rec, disch_date_rec0, disch_date_rec0_num, dit_rec1, concat, disch) |>
group_by(hash_key)|>
mutate(
corr_disch = if_else(
n() == 1 & disch_date_rec0 == as.Date("2019-12-31"),
disch,
as.Date(NA)
)
)|>
ungroup() |> filter(!is.na(corr_disch))
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("Inclusion of information provided by SENDA professionals, june 2025")
SISTRAT23_c1_2010_2022_df_prev1h <-
SISTRAT23_c1_2010_2022_df_prev1h |>
(\(df) {
cat(paste0("4.*. Database before correcting discharge dates, provided June 25, cases: ", formatC(nrow(df), big.mark=",")), "\n")
cat(paste0("4.*. Database before correcting discharge dates, provided June 25, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")), "\n")
df
})() |>
left_join(obtained_from_senda_professional_jun_2025 |> select(rn, corr_disch), by = "rn") |>
mutate(
disch_date_rec0 = if_else(!is.na(corr_disch), corr_disch, disch_date_rec0),
disch_date_rec0_num = unclass(disch_date_rec0),
dit_rec1 = if_else(!is.na(corr_disch), disch_date_rec0_num - adm_date_rec_num, dit_rec1),
#tr_compliance_rec = if_else(!is.na(corr_disch), NA_character_, tr_compliance_rec),
OBS = if_else(!is.na(corr_disch), paste0(OBS, "; 4.*.replaced missing disch dates"), OBS)
) |>
select(-corr_disch) |>
(\(df) {
cat(paste0("4.*. Database after correcting discharge dates, provided June 25, cases: ", formatC(nrow(df), big.mark=",")), "\n")
cat(paste0("4.*. Database after correcting discharge dates, provided June 25, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")), "\n")
if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1h)) stop("Error: Added treatment episodes in the process")
df
})()Format as most as possible the updated 2019 database in terms of datesTable of dates of discharge with days in treatment
2009-03-23 2009-10-01 2010-01-20 2010-04-03 2010-10-01 2019-11-05 2019-11-13
1 1 1 1 1 382 506
2023-04-28 2023-05-03
3995 1
We should discard dates previous to 2023-04-28 because they were part of the actuala dministrative truncation process
New discharge date from updated C1 2019 database (discarding discharges in 2024-04-28; COINCIDENCE by HASH & admission date), cases: 330
New discharge date from updated C1 2019 database (discarding discharges in 2024-04-28; COINCIDENCE by HASH & admission date), RUNs: 330
Lets check yearly database origin. Where do they come from?...
TABLE_rec n percent
20191 497 1
Corrected the database with updated discharge dates4.pre. Database before correcting discharge dates, cases: 150,187
4.pre. Database before discarding discharge dates, RUNs: 106,283
4.pre. Database after correcting discharge dates, cases: 150,187
4.pre. Database after discarding discharge dates, RUNs: 106,283
| Var1 | n | % | mean | median | Q0.25 | Q0.75 |
|---|---|---|---|---|---|---|
| completion | 59 | 17.9% | 372.66 | 332.0 | 223.50 | 447.50 |
| early adm discharge | 2 | 0.6% | 42.50 | 42.5 | 21.75 | 63.25 |
| early dropout | 63 | 19.1% | 64.46 | 67.0 | 52.50 | 82.00 |
| late adm discharge | 12 | 3.6% | 402.75 | 291.5 | 148.75 | 513.50 |
| late dropout | 162 | 49.1% | 197.35 | 161.5 | 121.50 | 244.75 |
| referral | 32 | 9.7% | 197.41 | 148.0 | 90.75 | 244.00 |
2025-06-05: Maureen saved some cases
Inclusion of information provided by SENDA professionals, june 20254.*. Database before correcting discharge dates, provided June 25, cases: 150,187
4.*. Database before correcting discharge dates, provided June 25, RUNs: 106,283
4.*. Database after correcting discharge dates, provided June 25, cases: 150,187
4.*. Database after correcting discharge dates, provided June 25, RUNs: 106,283
We corrected dates of discharge in numeric (disch_date_rec0_num) and date (disch_date_rec0) formats with “2019-12-31” and 18261. Also we corrected the days in treatment into dit_rec1. Finally, we recoded the tr_compliance variable to indicate that the treatment was truncated due to administrative reasons (tr_compliance_rec).
0. Rule-based deduplication
In order to find and delete duplicated data that does not add information relevant for the purposes of the study, we now may use these standardized variables as a criteria to achieve the goal of having a unique event per HASH, by reducing its complexity based on irrelevant differences.
0.a. Deduplication based on standardized columns of interest for the study
An analysis based on the following criteria, ended with an index of how many differences are within cases with the same HASH and date of admission, and in determining which variables can be tolerable to have differences. For example, if two or more cases share the same date of admission and hash, but most of the variables are different, it is possible to think that information may be lost if one of them is deleted. In another example, if two or more cases share the same date of admission and hash, but the only differences are observed in the days of treatment, one may think that only the case with more treatment days must be preserved.
- hash_key= Masked Identifier (RUN)
- region_del_centro= Chilean Region of the Center
- dit_rec= Days of Treatment
- adm_date_rec_num= Date of Admission to Treatment
- disch_date= Date of Discharge from Treatment
- id_centro= Treatment Center ID
- codigo_identificacion = SENDA ID
- adm_age_rec= Age at Admission to Treatment
- age_subs_onset= Age of Onset of Drug Use
- age_prim_subs_onset= Age of Onset of Drug Use Primary Substance
- type_center= Type of Center
- nacionalidad= Nationality
- etnia= Ethnicity
- diagnostico_trs_psiquiatrico_dsm_iv= Diagnosis of Psychiatric Disorders, DSM-IV criteria
- diagnostico_trs_psiquiatrico_sub_dsm_iv= Diagnosis of Psychiatric Disorders, DSM-IV criteria (sub-classification)
- x2_diagnostico_trs_psiquiatrico_dsm_iv= Diagnosis of Psychiatric Disorders, DSM-IV criteria (2)
- x2_diagnostico_trs_psiquiatrico_sub_dsm_iv= Diagnosis of Psychiatric Disorders, DSM-IV criteria (sub-classification) (2)
- x3_diagnostico_trs_psiquiatrico_dsm_iv= Diagnosis of Psychiatric Disorders, DSM-IV criteria (3)
- x3_diagnostico_trs_psiquiatrico_sub_dsm_iv= Diagnosis of Psychiatric Disorders, DSM-IV criteria (sub-classification) (3)
- diagnostico_trs_psiquiatrico_cie_10= Diagnosis of Psychiatric Disorders, CIE-10 criteria
- diagnostico_trs_psiquiatrico_sub_cie_10= Diagnosis of Psychiatric Disorders, CIE-10 criteria (subclassification)
- x2_diagnostico_trs_psiquiatrico_cie_10= Diagnosis of Psychiatric Disorders, CIE-10 criteria (2)
- x2_diagnostico_trs_psiquiatrico_sub_cie_10= Diagnosis of Psychiatric Disorders, CIE-10 criteria (subclassification) (2)
- x3_diagnostico_trs_psiquiatrico_cie_10= Diagnosis of Psychiatric Disorders, CIE-10 criteria (3)
- x3_diagnostico_trs_psiquiatrico_sub_cie_10= Diagnosis of Psychiatric Disorders, CIE-10 criteria (subclassification) (3)
- sub_dep_icd10_status= Drug dependence diagnosis
- biopsych_comp= Biopsychosocial compromise
- sexo= Sex of User
- plan_type= Type of Plan
- tipo_de_programa_2= Type of Program
- tr_compliance= Cause of Discharge (with late and early withdrawal)
- primary_sub= Primary or Main Substance of Consumption
- second_sub1= Other Substances (1)
- second_sub2= Other Substances (2)
- second_sub3= Other Substances (3)
- first_sub_used= Starting Substance
- marital_status= Marital Status
- occupation_condition= Occupational Status
- occupation_status= Occupational Category
- adm_motive= Motive of Admission to Treatment
- ed_attainment= Educational Attainment
- prim_sub_route= Route of Administration of the Primary or Main Substance
- prim_sub_freq= Frequency of Consumption of the Primary or Main Substance
- municipallity_res_cutpre18= Commune/municipallity of residence
Code
criterios_show<- c('hash_key', 'region_del_centro', 'dit_rec', 'adm_date_rec_num', 'disch_date', 'id_centro', 'codigo_identificacion ', 'adm_age_rec', 'age_subs_onset', 'age_prim_subs_onset', 'type_center', 'nacionalidad', 'etnia', 'diagnostico_trs_psiquiatrico_dsm_iv', 'diagnostico_trs_psiquiatrico_sub_dsm_iv', 'x2_diagnostico_trs_psiquiatrico_dsm_iv', 'x2_diagnostico_trs_psiquiatrico_sub_dsm_iv', 'x3_diagnostico_trs_psiquiatrico_dsm_iv', 'x3_diagnostico_trs_psiquiatrico_sub_dsm_iv', 'diagnostico_trs_psiquiatrico_cie_10', 'diagnostico_trs_psiquiatrico_sub_cie_10', 'x2_diagnostico_trs_psiquiatrico_cie_10', 'x2_diagnostico_trs_psiquiatrico_sub_cie_10', 'x3_diagnostico_trs_psiquiatrico_cie_10', 'x3_diagnostico_trs_psiquiatrico_sub_cie_10', 'sub_dep_icd10_status', 'biopsych_comp', 'sexo', 'plan_type', 'tipo_de_programa_2', 'tr_compliance', 'primary_sub', 'second_sub1', 'second_sub2', 'second_sub3', 'first_sub_used', 'marital_status', 'occupation_condition', 'occupation_status', 'adm_motive', 'ed_attainment', 'prim_sub_route', 'prim_sub_freq', 'municipallity_res_cutpre18')
#Duplicated entries
## according to DVG of 2020
### SENDA yes vs. no
### Earlier database
### More treatment days
### No discharge dates missing
### cases in study?. out
### cases with greater frequency of substance useThis section is no longer pertinent, as we have excluded patients with records that share identical admission dates.
0.b. Deduplication from the Overlap Between Dates of Admission & Discharge
Once the duplicated cases were discarded, we searched for cases in which dates ranges were overlapped with other treatments for the same user (HASH). To search different overlappings, we had to temporarily replace those cases that did not have a date of discharge, with the date of retrieval of the datasets that was “2024-04-28” [disch_date_num_miss] (dates are in the format “years-month-day” in this document).
Code
# dias_en_tratamiento
CONS_C1_df_dup_intervals<-
SISTRAT23_c1_2010_2022_df_prev1h|>
mutate(disch_date_num_miss= ifelse(is.na(disch_date_rec0_num), 19475, disch_date_rec0_num))|> #equivalente a 2023-04-28 as.numeric(as.Date("2023-01-01"))
rename("hash_key_2"="hash_key", "rn2"="rn")|>
select(rn2, hash_key_2, TABLE, adm_age_rec, adm_date_rec, adm_date_rec_num , disch_date_rec0, disch_date_num_miss, dit_rec1, id_centro, tr_compliance_rec, plan_type, senda)|>
#dplyr::filter(motivodeegreso!="Derivación")|>
data.table::as.data.table()
overlap_dates_C1 <- janitor::clean_names(
sqldf::sqldf(
"
SELECT *
FROM CONS_C1_df_dup_intervals AS x
INNER JOIN CONS_C1_df_dup_intervals AS y
ON x.hash_key_2 = y.hash_key_2
AND x.rn2 < y.rn2 -- Avoids duplicates (eg.: x vs y and then y vs x)
AND x.adm_date_rec_num < y.disch_date_num_miss -- x Admitted before being admitted into another treatment
AND x.disch_date_num_miss > y.adm_date_rec_num -- x Discharged after being admitted in other
"
)) |>
`colnames<-`(c("rn_1", "hash_key_1", "ano_bd_1", "adm_age_1", "adm_date_1", "adm_date_rec_num_1", "disch_date_1", "disch_date_num_1", "dit_1", "id_centro_1", "tr_compliance_1", "plan_type_1", "senda_1", "rn_2", "hash_key_2", "ano_bd_2", "adm_age_2", "adm_date_2", "adm_date_rec_num_2", "disch_date_2", "disch_date_num_2", "dit_2", "id_centro_2", "tr_compliance_2", "plan_type_2", "senda_2"))
cat(paste0("Number of overlapped dates, observations: ", nrow(overlap_dates_C1)),"\n")
cat(paste0("Number of overlapped dates, RUNs: ", nrow(distinct(overlap_dates_C1, hash_key_1))))
#Number of overlapped dates, observations: 1554 june 2025; 1562; march 2025 1659 ; in 2020, 1,448
#Number of overlapped dates, RUNs: 1413 june 2025; 1420; march 2025 1491; in 2020, 173
#The rows on the left originate from older databases.
CONS_C1_df_dup_overlaps_COMP <-
as_tidytable(overlap_dates_C1)|>
mutate(pair_id= paste0(rn_1,"_",rn_2))|>
mutate(same_id=ifelse(id_centro_1==id_centro_2,1,0))|>
mutate(bd_2_earlier=ifelse(ano_bd_2>ano_bd_1,1,0))|> #es el dato de la derecha de una base de datos mas reciente.
mutate(senda_status= case_when(senda_1=="si" & senda_2=="si"~ "both yes", senda_1=="no" & senda_2=="no"~ "both no", senda_1=="no" & senda_2=="si"~ "second yes", senda_1=="no" & senda_2=="no"~ "second no", T~NA_character_))|>
mutate(referral= ifelse(tr_compliance_1=="referral",1,0))|>
mutate(days_overlapped=disch_date_num_1-adm_date_rec_num_2)|> # para que hayan dias positivos. Se supone que la fecha de egreso es más reciente que la fecha de ingreso del evento que superpone.
mutate(more_dit=ifelse(dit_2>dit_1,1,0))|> #más días tratado en 2
mutate(trat_1_within_2=ifelse(disch_date_num_1<disch_date_num_2 & adm_date_rec_num_1>adm_date_rec_num_2,1,0))|>
select(-hash_key_2) |>
rename("hash_key"="hash_key_1")
####
CONS_C1_df_dup_overlaps_COMP|>
(\(df) {
mutate(df, hash_key= as.numeric(factor(hash_key)))|> rio::export("_out/_overlaps_dup_step_2.xlsx") #for visual comparison in excel
knitr::kable(filter(df, hash_key %in% pull(sample_n_with_seed(CONS_C1_df_dup_overlaps_COMP,20, seed=2125),"hash_key"))|> mutate(hash_key= as.numeric(factor(hash_key))), format = "html", format.args = list(decimal.mark = ".", big.mark = ","),
caption="Cases with overlapped treatment ranges", align = rep('c', 32),
#col.names = c("Row No.(1)", "HASH", "Year of\nDataset(1)", "Admission age(1)", "Admission\ndate(1)(a)", "Admission\ndate(1)(b)", "Discharge\ndate(1)(a)", "Discharge\ndate(1)(b)", "Treatment Days(1)", "Center ID(1)", "Cause of\nDischarge(1)", "Plan Type(1)", "SENDA(1)", "Row No.(2)", "Year of\nDataset(2)", "Admission age(2)", "Admission\ndate(2)(a)", "Admission\ndate(2)(b)", "Discharge\ndate(2)(a)", "Discharge\ndate(2)(b)", "Treatment Days(2)", "Center ID(2)", "Cause of\nDischarge(2)", "Plan Type(2)", "SENDA(2)", "Same Center ID", "Earlier Dataset \nof 2nd Treatment", "Financed \nBy SENDA", "Referral", "Days Overlapped", "2nd treatment has more treatment days", "1st treatment\nabsorbs 2nd")
) |>
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"),font_size = 8)|>
kableExtra::add_footnote( c("Note.Each row represents an overlap. Variables ending with '_1' are the first case, and variables ending with '_2' correspond to the second case;", "a= date; b= numeric", "Same Center ID= If both cases share the same Center ID", "Financed By SENDA= If both cases are financed by SENDA;", "Referral= If the cause of discharge is the referral from another center (1= Referral);","Days Overlapped= Difference between the date of admission of the earlier treatment, and the date of discharge of the latter treatment","2nd treatment has more treatment days= Earlier treatment has more days of treatment"), notation = "none")|>
kableExtra::scroll_box(width = "100%", height = "375px")
})()Number of overlapped dates, observations: 1554
Number of overlapped dates, RUNs: 1413
| rn_1 | hash_key | ano_bd_1 | adm_age_1 | adm_date_1 | adm_date_rec_num_1 | disch_date_1 | disch_date_num_1 | dit_1 | id_centro_1 | tr_compliance_1 | plan_type_1 | senda_1 | rn_2 | ano_bd_2 | adm_age_2 | adm_date_2 | adm_date_rec_num_2 | disch_date_2 | disch_date_num_2 | dit_2 | id_centro_2 | tr_compliance_2 | plan_type_2 | senda_2 | pair_id | same_id | bd_2_earlier | senda_status | referral | days_overlapped | more_dit | trat_1_within_2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 49,390 | 1 | 2014 | 35.81 | 2013-06-17 | 15,873 | 2014-01-27 | 16,097 | 224 | 166 | late dropout | pg-pab | si | 53,962 | 2014 | 36.35 | 2013-12-30 | 16,069 | 2014-01-03 | 16,073 | 4 | 163 | early dropout | m-pr | si | 49390_53962 | 0 | 0 | both yes | 0 | 28 | 0 | 0 |
| 64,649 | 2 | 2014 | 37.92 | 2014-11-11 | 16,385 | 2014-12-01 | 16,405 | 20 | 212 | referral | pg-pai | si | 70,654 | 2015 | 37.97 | 2014-11-28 | 16,402 | 2014-12-31 | 16,435 | 33 | 216 | referral | pg-pr | si | 64649_70654 | 0 | 1 | both yes | 1 | 3 | 1 | 0 |
| 62,072 | 3 | 2014 | 65.68 | 2014-09-25 | 16,338 | 2014-12-02 | 16,406 | 68 | 262 | referral | m-pai | si | 70,640 | 2015 | 65.87 | 2014-12-01 | 16,405 | 2015-10-13 | 16,721 | 316 | 258 | late adm discharge | m-pr | si | 62072_70640 | 0 | 1 | both yes | 1 | 1 | 1 | 0 |
| 18,306 | 4 | 2011 | 23.71 | 2011-08-10 | 15,196 | 2011-09-30 | 15,247 | 51 | 259 | completion | pg-pai | si | 22,738 | 2012 | 23.84 | 2011-09-28 | 15,245 | 2012-04-25 | 15,455 | 210 | 257 | referral | pg-pr | si | 18306_22738 | 0 | 1 | both yes | 0 | 2 | 1 | 0 |
| 21,516 | 5 | 2012 | 28.45 | 2011-06-06 | 15,131 | 2012-04-27 | 15,457 | 326 | 262 | referral | m-pai | si | 26,680 | 2012 | 29.33 | 2012-04-25 | 15,455 | 2012-06-12 | 15,503 | 48 | 275 | referral | m-pr | si | 21516_26680 | 0 | 0 | both yes | 1 | 2 | 0 | 0 |
| 28,059 | 5 | 2012 | 29.49 | 2012-06-21 | 15,512 | 2012-10-05 | 15,618 | 106 | 262 | referral | m-pai | si | 48,008 | 2014 | 29.78 | 2012-10-04 | 15,617 | 2014-04-01 | 16,161 | 544 | 302 | completion | m-pr | si | 28059_48008 | 0 | 1 | both yes | 1 | 1 | 1 | 0 |
| 34,898 | 6 | 2013 | 40.94 | 2012-11-05 | 15,649 | 2014-01-29 | 16,099 | 450 | 328 | late dropout | pg-pab | si | 53,984 | 2014 | 42.14 | 2014-01-20 | 16,090 | 2014-05-06 | 16,196 | 106 | 502 | late dropout | pg-pab | si | 34898_53984 | 0 | 1 | both yes | 0 | 9 | 0 | 0 |
| 75,748 | 7 | 2015 | 21.66 | 2015-03-24 | 16,518 | 2015-07-09 | 16,625 | 107 | 181 | referral | pg-pai | si | 79,451 | 2015 | 21.95 | 2015-07-06 | 16,622 | 2015-09-14 | 16,692 | 70 | 189 | completion | m-pr | si | 75748_79451 | 0 | 0 | both yes | 1 | 3 | 0 | 0 |
| 60,659 | 8 | 2014 | 39.43 | 2014-07-11 | 16,262 | 2014-09-12 | 16,325 | 63 | 141 | early dropout | pg-pab | si | 62,156 | 2014 | 39.58 | 2014-09-01 | 16,314 | 2014-11-25 | 16,399 | 85 | 141 | early dropout | pg-pab | si | 60659_62156 | 1 | 0 | both yes | 0 | 11 | 1 | 0 |
| 43,322 | 9 | 2013 | 27.85 | 2013-08-26 | 15,943 | 2013-12-20 | 16,059 | 116 | 209 | referral | pg-pab | si | 52,869 | 2014 | 28.16 | 2013-12-17 | 16,056 | 2014-03-01 | 16,130 | 74 | 432 | early dropout | m-pr | si | 43322_52869 | 0 | 1 | both yes | 1 | 3 | 0 | 0 |
| 48,546 | 10 | 2014 | 31.32 | 2013-03-11 | 15,775 | 2014-04-21 | 16,181 | 406 | 294 | late dropout | pg-pab | si | 59,129 | 2014 | 32.15 | 2014-01-06 | 16,076 | 2014-11-12 | 16,386 | 310 | 294 | late dropout | pg-pai | si | 48546_59129 | 1 | 0 | both yes | 0 | 105 | 0 | 0 |
| 51,732 | 11 | 2014 | 42.85 | 2013-11-04 | 16,013 | 2014-03-21 | 16,150 | 137 | 290 | referral | pg-pai | si | 66,455 | 2015 | 43.15 | 2014-02-24 | 16,125 | 2015-06-12 | 16,598 | 473 | 303 | completion | pg-pr | si | 51732_66455 | 0 | 1 | both yes | 1 | 25 | 1 | 0 |
| 24,214 | 12 | 2012 | 37.88 | 2012-01-31 | 15,370 | 2012-03-05 | 15,404 | 34 | 163 | early dropout | pg-pr | si | 33,099 | 2013 | 36.14 | 2010-05-07 | 14,736 | 2013-05-31 | 15,856 | 1,120 | 166 | late dropout | pg-pab | si | 24214_33099 | 0 | 1 | both yes | 0 | 668 | 1 | 1 |
| 64,530 | 13 | 2014 | 59.30 | 2014-11-04 | 16,378 | 2014-11-05 | 16,379 | 1 | early adm discharge | pg-pr | si | 86,673 | 2016 | 59.28 | 2014-10-27 | 16,370 | 2016-03-29 | 16,889 | 519 | 105 | completion | pg-pai | si | 64530_86673 | 1 | both yes | 0 | 9 | 1 | 1 | ||
| 75,380 | 14 | 2015 | 23.71 | 2015-04-02 | 16,527 | 2015-08-26 | 16,673 | 146 | 441 | referral | pg-pai | si | 89,030 | 2016 | 24.09 | 2015-08-18 | 16,665 | 2016-04-01 | 16,892 | 227 | 650 | late dropout | pg-pr | si | 75380_89030 | 0 | 1 | both yes | 1 | 8 | 1 | 0 |
| 127,231 | 15 | 2018 | 24.89 | 2017-04-11 | 17,267 | 2018-04-24 | 17,645 | 378 | 119 | referral | pg-pai | si | 136,733 | 2018 | 25.89 | 2018-04-11 | 17,632 | 2018-09-01 | 17,775 | 143 | 117 | late dropout | pg-pr | si | 127231_136733 | 0 | 0 | both yes | 1 | 13 | 0 | 0 |
| 105,585 | 16 | 2017 | 33.74 | 2015-02-18 | 16,484 | 2017-01-31 | 17,197 | 713 | 288 | referral | pg-pab | si | 112,500 | 2017 | 35.65 | 2017-01-16 | 17,182 | 2017-08-04 | 17,382 | 200 | 357 | completion | pg-pr | si | 105585_112500 | 0 | 0 | both yes | 1 | 15 | 0 | 0 |
| 25,237 | 17 | 2012 | 38.88 | 2012-03-06 | 15,405 | 2012-04-12 | 15,442 | 37 | 171 | referral | pg-pai | si | 65,490 | 2015 | 37.64 | 2010-12-10 | 14,953 | 2015-05-29 | 16,584 | 1,631 | 166 | completion | pg-pai | si | 25237_65490 | 0 | 1 | both yes | 1 | 489 | 1 | 1 |
| 33,552 | 17 | 2013 | 39.29 | 2012-08-01 | 15,553 | 2013-02-01 | 15,737 | 184 | 171 | referral | pg-pai | si | 65,490 | 2015 | 37.64 | 2010-12-10 | 14,953 | 2015-05-29 | 16,584 | 1,631 | 166 | completion | pg-pai | si | 33552_65490 | 0 | 1 | both yes | 1 | 784 | 1 | 1 |
| 25,522 | 18 | 2012 | 21.02 | 2012-01-27 | 15,366 | 2012-03-30 | 15,429 | 63 | 225 | referral | pg-pai | si | 25,941 | 2012 | 21.18 | 2012-03-26 | 15,425 | 2012-06-30 | 15,521 | 96 | 235 | late dropout | pg-pr | si | 25522_25941 | 0 | 0 | both yes | 1 | 4 | 1 | 0 |
| 210,165 | 19 | 2022 | 39.30 | 2021-11-11 | 18,942 | 2022-07-20 | 19,193 | 251 | 703 | referral | pg-pai | si | 217,904 | 2022 | 39.98 | 2022-07-18 | 19,191 | 19,475 | 162 | currently in | pg-pr | si | 210165_217904 | 0 | 0 | both yes | 1 | 2 | 0 | |||
| 52,331 | 20 | 2014 | 54.02 | 2013-12-03 | 16,042 | 2014-03-31 | 16,160 | 118 | 365 | referral | pg-pai | si | 66,690 | 2015 | 54.29 | 2014-03-14 | 16,143 | 2015-02-02 | 16,468 | 325 | 179 | completion | pg-pr | si | 52331_66690 | 0 | 1 | both yes | 1 | 17 | 1 | 0 |
| Note.Each row represents an overlap. Variables ending with '_1' are the first case, and variables ending with '_2' correspond to the second case; | ||||||||||||||||||||||||||||||||
| a= date; b= numeric | ||||||||||||||||||||||||||||||||
| Same Center ID= If both cases share the same Center ID | ||||||||||||||||||||||||||||||||
| Financed By SENDA= If both cases are financed by SENDA; | ||||||||||||||||||||||||||||||||
| Referral= If the cause of discharge is the referral from another center (1= Referral); | ||||||||||||||||||||||||||||||||
| Days Overlapped= Difference between the date of admission of the earlier treatment, and the date of discharge of the latter treatment | ||||||||||||||||||||||||||||||||
| 2nd treatment has more treatment days= Earlier treatment has more days of treatment |
We identified 1,554 overlappings. Some of the users appeared more than once (n= 95); those users may have competing dates of discharge, which will have to be chosen based on their individual trajectories.
0.b.0 Multiple overlappings
We first focused on cases that had multiple overlappings. These will be revised latter.
Code
# c("rn_1", "hash_key_1", "ano_bd_1", "adm_age_1", "adm_date_1", "adm_date_rec_num_1", "disch_date_1", "disch_date_num_1", "dit_1", "id_centro_1", "tr_compliance_1", "plan_type_1", "senda_1", "rn_2", "hash_key_2", "ano_bd_2", "adm_age_2", "adm_date_2", "adm_date_rec_num_2", "disch_date_2", "disch_date_num_2", "dit_2", "id_centro_2", "tr_compliance_2", "plan_type_2", "senda_2")
overlaps_after_miss_appear_more_than_one_time<-
CONS_C1_df_dup_overlaps_COMP |>
tidytable::pivot_longer(
cols = matches("_[12]$"), # All columns ending with _1 or _2
names_to = c(".value", "wave"),
names_pattern = "(.+)_([12])",
values_drop_na = FALSE) |>
group_by(rn) |>
count() |>
filter(n>1) |> pull(rn)
multiple_overlaps <-
CONS_C1_df_dup_overlaps_COMP |> filter(rn_1 %in% overlaps_after_miss_appear_more_than_one_time|rn_2 %in% overlaps_after_miss_appear_more_than_one_time)|>
(\(df) {
cat(paste0("00. Multiple overlappings, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("00. Multiple overlappings, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})()|>
tidytable::pivot_longer(
cols = matches("_[12]$"), # All columns ending with _1 or _2
names_to = c(".value", "wave"),
names_pattern = "(.+)_([12])",
values_drop_na = FALSE)|>
filter(dit<1095, senda!="no")|>
group_by(hash_key)|>
mutate(
max_ano_bd = max(ano_bd, na.rm = TRUE),
max_disch_date_num = max(disch_date_num, na.rm = TRUE)
)|>
# 1. Prioritize completed treatments
arrange(
# 2. Then longest duration
desc(dit),
# 3. Then most recent retrieval year of the database
desc(max_ano_bd),
# 4. Then most recent discharge date
desc(max_disch_date_num)
)|>
# Keep only the top-ranked row per group
slice(1)
#
# 00. Multiple overlappings, cases: 221; june 2025 174
# 00. Multiple overlappings, RUNs: 87; june 2025 67
invisible("These rules are too simplistic. I did not use them")
CONS_C1_df_dup_overlaps_COMP|> filter(rn_1 %in% overlaps_after_miss_appear_more_than_one_time|rn_2 %in% overlaps_after_miss_appear_more_than_one_time)|>
(\(df) {
rio::export(df, "_out/_multiple_overlappings.xlsx") #for visual comparison in excel
knitr::kable(mutate(df, hash_key= as.numeric(factor(hash_key))), format = "html", format.args = list(decimal.mark = ".", big.mark = ","), caption="Cases with multiple overlappings", align = rep('c', 32)
#col.names = c("Row No.(1)", "HASH", "Year of\nDataset(1)", "Admission age(1)", "Admission\ndate(1)(a)", "Admission\ndate(1)(b)", "Discharge\ndate(1)(a)", "Discharge\ndate(1)(b)", "Treatment Days(1)", "Center ID(1)", "Cause of\nDischarge(1)", "Plan Type(1)", "SENDA(1)", "Row No.(2)", "Year of\nDataset(2)", "Admission age(2)", "Admission\ndate(2)(a)", "Admission\ndate(2)(b)", "Discharge\ndate(2)(a)", "Discharge\ndate(2)(b)", "Treatment Days(2)", "Center ID(2)", "Cause of\nDischarge(2)", "Plan Type(2)", "SENDA(2)", "Same Center ID", "Earlier Dataset \nof 2nd Treatment", "Financed \nBy SENDA", "Referral", "Days Overlapped", "2nd treatment has more treatment days", "1st treatment\nabsorbs 2nd")
) |>
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"),font_size = 8)|>
kableExtra::add_footnote( c("Note.Each row represents an overlap. Variables ending with '_1' are the first case, and variables ending with '_2' correspond to the second case;", "a= date; b= numeric", "Same Center ID= If both cases share the same Center ID", "Financed By SENDA= If both cases are financed by SENDA;", "Referral= If the cause of discharge is the referral from another center (1= Referral);","Days Overlapped= Difference between the date of admission of the earlier treatment, and the date of discharge of the latter treatment","2nd treatment has more treatment days= Earlier treatment has more days of treatment"), notation = "none")|>
kableExtra::scroll_box(width = "100%", height = "375px")
})()00. Multiple overlappings, cases: 174
00. Multiple overlappings, RUNs: 67
| rn_1 | hash_key | ano_bd_1 | adm_age_1 | adm_date_1 | adm_date_rec_num_1 | disch_date_1 | disch_date_num_1 | dit_1 | id_centro_1 | tr_compliance_1 | plan_type_1 | senda_1 | rn_2 | ano_bd_2 | adm_age_2 | adm_date_2 | adm_date_rec_num_2 | disch_date_2 | disch_date_num_2 | dit_2 | id_centro_2 | tr_compliance_2 | plan_type_2 | senda_2 | pair_id | same_id | bd_2_earlier | senda_status | referral | days_overlapped | more_dit | trat_1_within_2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 71,738 | 1 | 2015 | 26.52 | 2014-06-30 | 16,251 | 2015-07-08 | 16,624 | 373 | 294 | referral | pg-pab | si | 78,750 | 2015 | 27.52 | 2015-06-30 | 16,616 | 2016-02-01 | 16,832 | 216 | 559 | late dropout | pg-pai | si | 71738_78750 | 0 | 0 | both yes | 1 | 8 | 0 | 0 |
| 61,433 | 1 | 2014 | 26.65 | 2014-08-18 | 16,300 | 2014-11-25 | 16,399 | 99 | 297 | completion | pg-pr | si | 71,738 | 2015 | 26.52 | 2014-06-30 | 16,251 | 2015-07-08 | 16,624 | 373 | 294 | referral | pg-pab | si | 61433_71738 | 0 | 1 | both yes | 0 | 148 | 1 | 1 |
| 97,268 | 2 | 2016 | 25.85 | 2016-05-24 | 16,945 | 2016-09-08 | 17,052 | 107 | 161 | late dropout | pg-pai | si | 126,406 | 2018 | 26.12 | 2016-08-30 | 17,043 | 2018-08-01 | 17,744 | 701 | 615 | completion | pg-pab | si | 97268_126406 | 0 | 1 | both yes | 0 | 9 | 1 | 0 |
| 103,121 | 2 | 2016 | 26.27 | 2016-10-24 | 17,098 | 2016-12-01 | 17,136 | 38 | 161 | early dropout | pg-pai | si | 126,406 | 2018 | 26.12 | 2016-08-30 | 17,043 | 2018-08-01 | 17,744 | 701 | 615 | completion | pg-pab | si | 103121_126406 | 0 | 1 | both yes | 0 | 93 | 1 | 1 |
| 35,598 | 3 | 2013 | 32.61 | 2013-01-02 | 15,707 | 2013-01-25 | 15,730 | 23 | 161 | referral | pg-pai | si | 36,801 | 2013 | 32.57 | 2012-12-20 | 15,694 | 2013-05-30 | 15,855 | 161 | 161 | late dropout | pg-pai | si | 35598_36801 | 1 | 0 | both yes | 1 | 36 | 1 | 1 |
| 35,933 | 3 | 2013 | 32.67 | 2013-01-25 | 15,730 | 2013-01-27 | 15,732 | 2 | 289 | early dropout | pg-pr | si | 36,801 | 2013 | 32.57 | 2012-12-20 | 15,694 | 2013-05-30 | 15,855 | 161 | 161 | late dropout | pg-pai | si | 35933_36801 | 0 | 0 | both yes | 0 | 38 | 1 | 1 |
| 210,000 | 4 | 2022 | 33.96 | 2021-10-18 | 18,918 | 19,475 | 415 | currently in | pg-pab | si | 214,345 | 2022 | 34.38 | 2022-03-22 | 19,073 | 2022-05-24 | 19,136 | 63 | 248 | referral | pg-pai | si | 210000_214345 | 0 | 0 | both yes | 0 | 402 | 0 | |||
| 210,000 | 4 | 2022 | 33.96 | 2021-10-18 | 18,918 | 19,475 | 415 | currently in | pg-pab | si | 215,964 | 2022 | 34.56 | 2022-05-25 | 19,137 | 19,475 | 266 | currently in | pg-pr | si | 210000_215964 | 0 | 0 | both yes | 0 | 338 | 0 | |||||
| 5,037 | 5 | 2010 | 31.71 | 2009-03-04 | 14,307 | 2010-05-31 | 14,760 | 453 | 109 | referral | pg-pai | si | 6,345 | 2010 | 31.73 | 2009-03-12 | 14,315 | 2010-12-31 | 14,974 | 659 | 109 | referral | pg-pab | si | 5037_6345 | 1 | 0 | both yes | 1 | 445 | 1 | 0 |
| 5,037 | 5 | 2010 | 31.71 | 2009-03-04 | 14,307 | 2010-05-31 | 14,760 | 453 | 109 | referral | pg-pai | si | 10,895 | 2011 | 31.74 | 2009-03-15 | 14,318 | 2011-03-31 | 15,064 | 746 | 109 | referral | pg-pai | no | 5037_10895 | 1 | 1 | 1 | 442 | 1 | 0 | |
| 5,037 | 5 | 2010 | 31.71 | 2009-03-04 | 14,307 | 2010-05-31 | 14,760 | 453 | 109 | referral | pg-pai | si | 16,311 | 2011 | 31.71 | 2009-03-03 | 14,306 | 2011-12-01 | 15,309 | 1,003 | 109 | referral | pg-pab | si | 5037_16311 | 1 | 1 | both yes | 1 | 454 | 1 | 1 |
| 6,345 | 5 | 2010 | 31.73 | 2009-03-12 | 14,315 | 2010-12-31 | 14,974 | 659 | 109 | referral | pg-pab | si | 10,895 | 2011 | 31.74 | 2009-03-15 | 14,318 | 2011-03-31 | 15,064 | 746 | 109 | referral | pg-pai | no | 6345_10895 | 1 | 1 | 1 | 656 | 1 | 0 | |
| 6,345 | 5 | 2010 | 31.73 | 2009-03-12 | 14,315 | 2010-12-31 | 14,974 | 659 | 109 | referral | pg-pab | si | 16,311 | 2011 | 31.71 | 2009-03-03 | 14,306 | 2011-12-01 | 15,309 | 1,003 | 109 | referral | pg-pab | si | 6345_16311 | 1 | 1 | both yes | 1 | 668 | 1 | 1 |
| 10,895 | 5 | 2011 | 31.74 | 2009-03-15 | 14,318 | 2011-03-31 | 15,064 | 746 | 109 | referral | pg-pai | no | 16,311 | 2011 | 31.71 | 2009-03-03 | 14,306 | 2011-12-01 | 15,309 | 1,003 | 109 | referral | pg-pab | si | 10895_16311 | 1 | 0 | second yes | 1 | 758 | 1 | 1 |
| 22,218 | 6 | 2012 | 27.55 | 2011-03-04 | 15,037 | 2012-01-20 | 15,359 | 322 | 106 | late dropout | pg-pab | si | 48,023 | 2014 | 27.56 | 2011-03-10 | 15,043 | 2014-06-30 | 16,251 | 1,208 | 106 | completion | pg-pab | si | 22218_48023 | 1 | 1 | both yes | 0 | 316 | 1 | 0 |
| 15,745 | 6 | 2011 | 27.56 | 2011-03-11 | 15,044 | 2011-06-02 | 15,127 | 83 | 106 | early dropout | pg-pab | si | 22,218 | 2012 | 27.55 | 2011-03-04 | 15,037 | 2012-01-20 | 15,359 | 322 | 106 | late dropout | pg-pab | si | 15745_22218 | 1 | 1 | both yes | 0 | 90 | 1 | 1 |
| 15,745 | 6 | 2011 | 27.56 | 2011-03-11 | 15,044 | 2011-06-02 | 15,127 | 83 | 106 | early dropout | pg-pab | si | 48,023 | 2014 | 27.56 | 2011-03-10 | 15,043 | 2014-06-30 | 16,251 | 1,208 | 106 | completion | pg-pab | si | 15745_48023 | 1 | 1 | both yes | 0 | 84 | 1 | 1 |
| 29,888 | 7 | 2012 | 40.20 | 2012-08-22 | 15,574 | 2012-09-28 | 15,611 | 37 | 109 | early adm discharge | pg-pab | si | 30,707 | 2012 | 40.24 | 2012-09-07 | 15,590 | 2012-12-20 | 15,694 | 104 | 109 | referral | pg-pai | si | 29888_30707 | 1 | 0 | both yes | 0 | 21 | 1 | 0 |
| 30,707 | 7 | 2012 | 40.24 | 2012-09-07 | 15,590 | 2012-12-20 | 15,694 | 104 | 109 | referral | pg-pai | si | 37,147 | 2013 | 40.50 | 2012-12-12 | 15,686 | 2013-11-15 | 16,024 | 338 | 117 | late dropout | pg-pr | si | 30707_37147 | 0 | 1 | both yes | 1 | 8 | 1 | 0 |
| 6,593 | 8 | 2010 | 40.92 | 2010-07-09 | 14,799 | 2019-12-31 | 18,261 | 3,462 | 278 | adm truncated | pg-pab | si | 48,343 | 2014 | 43.43 | 2013-01-11 | 15,716 | 2014-05-29 | 16,219 | 503 | 329 | late dropout | pg-pai | si | 6593_48343 | 0 | 1 | both yes | 0 | 2,545 | 0 | 0 |
| 6,593 | 8 | 2010 | 40.92 | 2010-07-09 | 14,799 | 2019-12-31 | 18,261 | 3,462 | 278 | adm truncated | pg-pab | si | 67,665 | 2015 | 44.90 | 2014-07-01 | 16,252 | 2015-07-14 | 16,630 | 378 | 329 | late adm discharge | pg-pai | si | 6593_67665 | 0 | 1 | both yes | 0 | 2,009 | 0 | 0 |
| 6,593 | 8 | 2010 | 40.92 | 2010-07-09 | 14,799 | 2019-12-31 | 18,261 | 3,462 | 278 | adm truncated | pg-pab | si | 134,162 | 2018 | 48.49 | 2018-02-01 | 17,563 | 2018-12-27 | 17,892 | 329 | 601 | referral | pg-pai | si | 6593_134162 | 0 | 1 | both yes | 0 | 698 | 0 | 0 |
| 70,143 | 9 | 2015 | 28.45 | 2014-11-04 | 16,378 | 2015-07-03 | 16,619 | 241 | 178 | referral | m-pai | si | 79,375 | 2015 | 29.11 | 2015-07-01 | 16,617 | 2015-10-08 | 16,716 | 99 | 179 | late adm discharge | pg-pr | si | 70143_79375 | 0 | 0 | both yes | 1 | 2 | 0 | 0 |
| 79,375 | 9 | 2015 | 29.11 | 2015-07-01 | 16,617 | 2015-10-08 | 16,716 | 99 | 179 | late adm discharge | pg-pr | si | 90,320 | 2016 | 29.37 | 2015-10-05 | 16,713 | 2016-12-01 | 17,136 | 423 | 178 | late adm discharge | m-pai | si | 79375_90320 | 0 | 1 | both yes | 0 | 3 | 1 | 0 |
| 2,201 | 10 | 2010 | 25.97 | 2009-11-16 | 14,564 | 2010-01-11 | 14,620 | 56 | 297 | early dropout | pg-pr | si | 6,996 | 2010 | 24.90 | 2008-10-20 | 14,172 | 2010-10-18 | 14,900 | 728 | 161 | completion | pg-pab | no | 2201_6996 | 0 | 0 | 0 | 448 | 1 | 1 | |
| 6,402 | 10 | 2010 | 26.63 | 2010-07-14 | 14,804 | 2010-10-04 | 14,886 | 82 | 161 | early adm discharge | pg-pai | no | 6,996 | 2010 | 24.90 | 2008-10-20 | 14,172 | 2010-10-18 | 14,900 | 728 | 161 | completion | pg-pab | no | 6402_6996 | 1 | 0 | both no | 0 | 714 | 1 | 1 |
| 12,461 | 11 | 2011 | 34.55 | 2010-12-07 | 14,950 | 2011-03-30 | 15,063 | 113 | 171 | referral | pg-pai | si | 38,473 | 2013 | 34.80 | 2011-03-08 | 15,041 | 2013-06-21 | 15,877 | 836 | 166 | referral | pg-pai | si | 12461_38473 | 0 | 1 | both yes | 1 | 22 | 1 | 0 |
| 38,473 | 11 | 2013 | 34.80 | 2011-03-08 | 15,041 | 2013-06-21 | 15,877 | 836 | 166 | referral | pg-pai | si | 41,391 | 2013 | 37.08 | 2013-06-19 | 15,875 | 2013-11-20 | 16,029 | 154 | 163 | referral | pg-pr | si | 38473_41391 | 0 | 0 | both yes | 1 | 2 | 0 | 0 |
| 28,802 | 11 | 2012 | 36.19 | 2012-07-30 | 15,551 | 2012-08-06 | 15,558 | 7 | 171 | referral | pg-pai | si | 38,473 | 2013 | 34.80 | 2011-03-08 | 15,041 | 2013-06-21 | 15,877 | 836 | 166 | referral | pg-pai | si | 28802_38473 | 0 | 1 | both yes | 1 | 517 | 1 | 1 |
| 62,746 | 12 | 2014 | 32.45 | 2014-10-07 | 16,350 | 2015-01-26 | 16,461 | 111 | 124 | referral | pg-pab | si | 62,885 | 2014 | 32.44 | 2014-10-01 | 16,344 | 2014-12-01 | 16,405 | 61 | 124 | early dropout | pg-pab | no | 62746_62885 | 1 | 0 | 1 | 117 | 0 | 0 | |
| 62,746 | 12 | 2014 | 32.45 | 2014-10-07 | 16,350 | 2015-01-26 | 16,461 | 111 | 124 | referral | pg-pab | si | 71,596 | 2015 | 32.75 | 2015-01-22 | 16,457 | 2015-04-01 | 16,526 | 69 | 434 | early dropout | pg-pab | si | 62746_71596 | 0 | 1 | both yes | 1 | 4 | 0 | 0 |
| 92,857 | 13 | 2016 | 34.88 | 2016-01-20 | 16,820 | 2016-06-17 | 16,969 | 149 | 408 | late adm discharge | pg-pai | si | 95,989 | 2016 | 34.91 | 2016-02-01 | 16,832 | 2016-05-04 | 16,925 | 93 | 408 | referral | pg-pai | no | 92857_95989 | 1 | 0 | 0 | 137 | 0 | 0 | |
| 92,857 | 13 | 2016 | 34.88 | 2016-01-20 | 16,820 | 2016-06-17 | 16,969 | 149 | 408 | late adm discharge | pg-pai | si | 107,824 | 2017 | 35.17 | 2016-05-05 | 16,926 | 2017-04-28 | 17,284 | 358 | 189 | completion | m-pr | si | 92857_107824 | 0 | 1 | both yes | 0 | 43 | 1 | 0 |
| 78,289 | 14 | 2015 | 35.03 | 2015-06-18 | 16,604 | 2015-07-28 | 16,644 | 40 | 141 | referral | pg-pai | si | 80,895 | 2015 | 35.02 | 2015-06-12 | 16,598 | 2015-11-26 | 16,765 | 167 | 141 | late dropout | pg-pai | si | 78289_80895 | 1 | 0 | both yes | 1 | 46 | 1 | 1 |
| 79,294 | 14 | 2015 | 35.14 | 2015-07-28 | 16,644 | 2015-08-10 | 16,657 | 13 | 142 | referral | m-pr | si | 80,895 | 2015 | 35.02 | 2015-06-12 | 16,598 | 2015-11-26 | 16,765 | 167 | 141 | late dropout | pg-pai | si | 79294_80895 | 0 | 0 | both yes | 1 | 59 | 1 | 1 |
| 65,616 | 15 | 2015 | 52.83 | 2012-04-09 | 15,439 | 2015-07-23 | 16,639 | 1,200 | 166 | referral | pg-pab | si | 88,257 | 2016 | 56.11 | 2015-07-20 | 16,636 | 2016-03-02 | 16,862 | 226 | 163 | completion | m-pr | si | 65616_88257 | 0 | 1 | both yes | 1 | 3 | 0 | 0 |
| 29,337 | 15 | 2012 | 53.16 | 2012-08-07 | 15,559 | 2012-09-03 | 15,586 | 27 | 104 | completion | m-pr | si | 65,616 | 2015 | 52.83 | 2012-04-09 | 15,439 | 2015-07-23 | 16,639 | 1,200 | 166 | referral | pg-pab | si | 29337_65616 | 0 | 1 | both yes | 0 | 147 | 1 | 1 |
| 35,046 | 15 | 2013 | 53.41 | 2012-11-06 | 15,650 | 2013-04-04 | 15,799 | 149 | 171 | completion | pg-pai | si | 65,616 | 2015 | 52.83 | 2012-04-09 | 15,439 | 2015-07-23 | 16,639 | 1,200 | 166 | referral | pg-pab | si | 35046_65616 | 0 | 1 | both yes | 0 | 360 | 1 | 1 |
| 6,791 | 16 | 2010 | 27.28 | 2009-10-23 | 14,540 | 2010-12-31 | 14,974 | 434 | 122 | completion | pg-pab | si | 12,841 | 2011 | 27.26 | 2009-10-15 | 14,532 | 2011-05-31 | 15,125 | 593 | 122 | completion | m-pai | si | 6791_12841 | 1 | 1 | both yes | 0 | 442 | 1 | 1 |
| 6,105 | 16 | 2010 | 27.93 | 2010-06-17 | 14,777 | 2010-07-01 | 14,791 | 14 | 122 | early adm discharge | pg-pab | no | 6,791 | 2010 | 27.28 | 2009-10-23 | 14,540 | 2010-12-31 | 14,974 | 434 | 122 | completion | pg-pab | si | 6105_6791 | 1 | 0 | second yes | 0 | 251 | 1 | 1 |
| 6,105 | 16 | 2010 | 27.93 | 2010-06-17 | 14,777 | 2010-07-01 | 14,791 | 14 | 122 | early adm discharge | pg-pab | no | 12,841 | 2011 | 27.26 | 2009-10-15 | 14,532 | 2011-05-31 | 15,125 | 593 | 122 | completion | m-pai | si | 6105_12841 | 1 | 1 | second yes | 0 | 259 | 1 | 1 |
| 12,180 | 17 | 2011 | 52.72 | 2008-12-10 | 14,223 | 2012-01-31 | 15,370 | 1,147 | 109 | referral | pg-pai | si | 25,660 | 2012 | 53.02 | 2009-03-31 | 14,334 | 2013-01-31 | 15,736 | 1,402 | 109 | late dropout | pg-pai | si | 12180_25660 | 1 | 1 | both yes | 1 | 1,036 | 1 | 0 |
| 12,180 | 17 | 2011 | 52.72 | 2008-12-10 | 14,223 | 2012-01-31 | 15,370 | 1,147 | 109 | referral | pg-pai | si | 48,425 | 2014 | 53.94 | 2010-03-01 | 14,669 | 2014-09-30 | 16,343 | 1,674 | 109 | completion | pg-pab | si | 12180_48425 | 1 | 1 | both yes | 1 | 701 | 1 | 0 |
| 8,809 | 17 | 2010 | 52.72 | 2008-12-12 | 14,225 | 2010-10-31 | 14,913 | 688 | 109 | referral | pg-pab | si | 12,180 | 2011 | 52.72 | 2008-12-10 | 14,223 | 2012-01-31 | 15,370 | 1,147 | 109 | referral | pg-pai | si | 8809_12180 | 1 | 1 | both yes | 1 | 690 | 1 | 1 |
| 8,809 | 17 | 2010 | 52.72 | 2008-12-12 | 14,225 | 2010-10-31 | 14,913 | 688 | 109 | referral | pg-pab | si | 25,660 | 2012 | 53.02 | 2009-03-31 | 14,334 | 2013-01-31 | 15,736 | 1,402 | 109 | late dropout | pg-pai | si | 8809_25660 | 1 | 1 | both yes | 1 | 579 | 1 | 0 |
| 8,809 | 17 | 2010 | 52.72 | 2008-12-12 | 14,225 | 2010-10-31 | 14,913 | 688 | 109 | referral | pg-pab | si | 48,425 | 2014 | 53.94 | 2010-03-01 | 14,669 | 2014-09-30 | 16,343 | 1,674 | 109 | completion | pg-pab | si | 8809_48425 | 1 | 1 | both yes | 1 | 244 | 1 | 0 |
| 25,660 | 17 | 2012 | 53.02 | 2009-03-31 | 14,334 | 2013-01-31 | 15,736 | 1,402 | 109 | late dropout | pg-pai | si | 48,425 | 2014 | 53.94 | 2010-03-01 | 14,669 | 2014-09-30 | 16,343 | 1,674 | 109 | completion | pg-pab | si | 25660_48425 | 1 | 1 | both yes | 0 | 1,067 | 1 | 0 |
| 20,109 | 18 | 2011 | 59.00 | 2011-11-15 | 15,293 | 2015-01-02 | 16,437 | 1,144 | 236 | late adm discharge | pg-pr | no | 62,620 | 2014 | 61.88 | 2014-10-01 | 16,344 | 2014-11-26 | 16,400 | 56 | 238 | referral | pg-pab | no | 20109_62620 | 0 | 1 | both no | 0 | 93 | 0 | 0 |
| 20,109 | 18 | 2011 | 59.00 | 2011-11-15 | 15,293 | 2015-01-02 | 16,437 | 1,144 | 236 | late adm discharge | pg-pr | no | 72,297 | 2015 | 62.01 | 2014-11-18 | 16,392 | 2015-08-10 | 16,657 | 265 | 258 | completion | pg-pr | si | 20109_72297 | 0 | 1 | second yes | 0 | 45 | 0 | 0 |
| 62,620 | 18 | 2014 | 61.88 | 2014-10-01 | 16,344 | 2014-11-26 | 16,400 | 56 | 238 | referral | pg-pab | no | 72,297 | 2015 | 62.01 | 2014-11-18 | 16,392 | 2015-08-10 | 16,657 | 265 | 258 | completion | pg-pr | si | 62620_72297 | 0 | 1 | second yes | 1 | 8 | 1 | 0 |
| 29,467 | 19 | 2012 | 35.58 | 2009-09-15 | 14,502 | 2012-11-05 | 15,649 | 1,147 | 123 | referral | pg-pab | si | 34,820 | 2013 | 38.65 | 2012-10-10 | 15,623 | 2013-10-21 | 15,999 | 376 | 117 | late dropout | pg-pr | si | 29467_34820 | 0 | 1 | both yes | 1 | 26 | 0 | 0 |
| 21,597 | 19 | 2012 | 37.37 | 2011-06-30 | 15,155 | 2012-07-25 | 15,546 | 391 | 123 | late dropout | pg-pab | si | 29,467 | 2012 | 35.58 | 2009-09-15 | 14,502 | 2012-11-05 | 15,649 | 1,147 | 123 | referral | pg-pab | si | 21597_29467 | 1 | 0 | both yes | 0 | 1,044 | 1 | 1 |
| 45,610 | 20 | 2013 | 38.05 | 2013-10-04 | 15,982 | 2013-12-18 | 16,057 | 75 | 261 | referral | pg-pai | si | 52,801 | 2014 | 38.24 | 2013-12-12 | 16,051 | 2014-10-01 | 16,344 | 293 | 258 | completion | m-pr | si | 45610_52801 | 0 | 1 | both yes | 1 | 6 | 1 | 0 |
| 52,801 | 20 | 2014 | 38.24 | 2013-12-12 | 16,051 | 2014-10-01 | 16,344 | 293 | 258 | completion | m-pr | si | 70,544 | 2015 | 39.04 | 2014-09-30 | 16,343 | 2015-05-27 | 16,582 | 239 | 261 | late dropout | pg-pai | si | 52801_70544 | 0 | 1 | both yes | 0 | 1 | 0 | 0 |
| 336 | 21 | 2010 | 21.88 | 2007-07-17 | 13,711 | 2010-02-16 | 14,656 | 945 | 118 | referral | pg-pai | si | 3,914 | 2010 | 23.75 | 2009-05-29 | 14,393 | 2011-01-05 | 14,979 | 586 | 118 | referral | pg-pab | si | 336_3914 | 1 | 0 | both yes | 1 | 263 | 0 | 0 |
| 336 | 21 | 2010 | 21.88 | 2007-07-17 | 13,711 | 2010-02-16 | 14,656 | 945 | 118 | referral | pg-pai | si | 12,792 | 2011 | 24.35 | 2010-01-04 | 14,613 | 2012-02-01 | 15,371 | 758 | 118 | late adm discharge | pg-pai | si | 336_12792 | 1 | 1 | both yes | 1 | 43 | 0 | 0 |
| 3,914 | 21 | 2010 | 23.75 | 2009-05-29 | 14,393 | 2011-01-05 | 14,979 | 586 | 118 | referral | pg-pab | si | 12,792 | 2011 | 24.35 | 2010-01-04 | 14,613 | 2012-02-01 | 15,371 | 758 | 118 | late adm discharge | pg-pai | si | 3914_12792 | 1 | 1 | both yes | 1 | 366 | 1 | 0 |
| 23,666 | 22 | 2012 | 44.38 | 2012-01-13 | 15,352 | 2012-06-14 | 15,505 | 153 | 142 | completion | pg-pr | si | 38,062 | 2013 | 44.04 | 2011-09-12 | 15,229 | 2013-05-27 | 15,852 | 623 | 291 | completion | pg-pai | si | 23666_38062 | 0 | 1 | both yes | 0 | 276 | 1 | 1 |
| 27,925 | 22 | 2012 | 44.80 | 2012-06-15 | 15,506 | 2012-07-06 | 15,527 | 21 | 142 | referral | m-pr | si | 38,062 | 2013 | 44.04 | 2011-09-12 | 15,229 | 2013-05-27 | 15,852 | 623 | 291 | completion | pg-pai | si | 27925_38062 | 0 | 1 | both yes | 1 | 298 | 1 | 1 |
| 28,306 | 22 | 2012 | 44.87 | 2012-07-09 | 15,530 | 2012-10-30 | 15,643 | 113 | 160 | late adm discharge | m-pai | si | 38,062 | 2013 | 44.04 | 2011-09-12 | 15,229 | 2013-05-27 | 15,852 | 623 | 291 | completion | pg-pai | si | 28306_38062 | 0 | 1 | both yes | 0 | 414 | 1 | 1 |
| 31,548 | 22 | 2012 | 45.23 | 2012-11-19 | 15,663 | 2013-03-11 | 15,775 | 112 | 291 | late dropout | pg-pai | si | 38,062 | 2013 | 44.04 | 2011-09-12 | 15,229 | 2013-05-27 | 15,852 | 623 | 291 | completion | pg-pai | si | 31548_38062 | 1 | 1 | both yes | 0 | 546 | 1 | 1 |
| 12,972 | 23 | 2011 | 21.38 | 2011-01-24 | 14,998 | 2011-06-15 | 15,140 | 142 | 238 | late dropout | pg-pab | si | 21,856 | 2012 | 21.73 | 2011-06-02 | 15,127 | 2013-02-01 | 15,737 | 610 | 246 | completion | pg-pai | si | 12972_21856 | 0 | 1 | both yes | 0 | 13 | 1 | 0 |
| 21,856 | 23 | 2012 | 21.73 | 2011-06-02 | 15,127 | 2013-02-01 | 15,737 | 610 | 246 | completion | pg-pai | si | 29,812 | 2012 | 23.04 | 2012-09-24 | 15,607 | 2012-10-05 | 15,618 | 11 | 148 | early dropout | pg-pai | no | 21856_29812 | 0 | 0 | 0 | 130 | 0 | 0 | |
| 68,152 | 24 | 2015 | 55.21 | 2014-07-08 | 16,259 | 2015-01-29 | 16,464 | 205 | 428 | referral | pg-pab | si | 71,405 | 2015 | 55.72 | 2015-01-07 | 16,442 | 2015-07-30 | 16,646 | 204 | 243 | referral | m-pr | si | 68152_71405 | 0 | 0 | both yes | 1 | 22 | 0 | 0 |
| 71,405 | 24 | 2015 | 55.72 | 2015-01-07 | 16,442 | 2015-07-30 | 16,646 | 204 | 243 | referral | m-pr | si | 80,579 | 2015 | 56.19 | 2015-06-30 | 16,616 | 2015-11-24 | 16,763 | 147 | 428 | referral | pg-pab | si | 71405_80579 | 0 | 0 | both yes | 1 | 30 | 0 | 0 |
| 80,579 | 24 | 2015 | 56.19 | 2015-06-30 | 16,616 | 2015-11-24 | 16,763 | 147 | 428 | referral | pg-pab | si | 90,940 | 2016 | 56.56 | 2015-11-10 | 16,749 | 2016-03-11 | 16,871 | 122 | 260 | late dropout | pg-pai | si | 80579_90940 | 0 | 1 | both yes | 1 | 14 | 0 | 0 |
| 3,961 | 25 | 2010 | 37.41 | 2010-03-22 | 14,690 | 2010-04-15 | 14,714 | 24 | 123 | referral | pg-pai | si | 30,636 | 2012 | 37.18 | 2009-12-28 | 14,606 | 2012-11-13 | 15,657 | 1,051 | 123 | referral | pg-pai | si | 3961_30636 | 1 | 1 | both yes | 1 | 108 | 1 | 1 |
| 5,127 | 25 | 2010 | 37.48 | 2010-04-16 | 14,715 | 2010-07-18 | 14,808 | 93 | 117 | late dropout | pg-pr | si | 30,636 | 2012 | 37.18 | 2009-12-28 | 14,606 | 2012-11-13 | 15,657 | 1,051 | 123 | referral | pg-pai | si | 5127_30636 | 0 | 1 | both yes | 0 | 202 | 1 | 1 |
| 14,074 | 26 | 2011 | 23.27 | 2008-11-25 | 14,208 | 2011-02-28 | 15,033 | 825 | 109 | referral | pg-pai | si | 23,472 | 2012 | 23.35 | 2008-12-25 | 14,238 | 2012-01-31 | 15,370 | 1,132 | 109 | referral | pg-pab | si | 14074_23472 | 1 | 1 | both yes | 1 | 795 | 1 | 0 |
| 14,074 | 26 | 2011 | 23.27 | 2008-11-25 | 14,208 | 2011-02-28 | 15,033 | 825 | 109 | referral | pg-pai | si | 25,150 | 2012 | 23.53 | 2009-03-02 | 14,305 | 2012-03-21 | 15,420 | 1,115 | 109 | referral | pg-pai | si | 14074_25150 | 1 | 1 | both yes | 1 | 728 | 1 | 0 |
| 23,472 | 26 | 2012 | 23.35 | 2008-12-25 | 14,238 | 2012-01-31 | 15,370 | 1,132 | 109 | referral | pg-pab | si | 25,150 | 2012 | 23.53 | 2009-03-02 | 14,305 | 2012-03-21 | 15,420 | 1,115 | 109 | referral | pg-pai | si | 23472_25150 | 1 | 0 | both yes | 1 | 1,065 | 0 | 0 |
| 14,882 | 26 | 2011 | 25.55 | 2011-03-07 | 15,040 | 2011-06-30 | 15,155 | 115 | 109 | referral | pg-pab | si | 23,472 | 2012 | 23.35 | 2008-12-25 | 14,238 | 2012-01-31 | 15,370 | 1,132 | 109 | referral | pg-pab | si | 14882_23472 | 1 | 1 | both yes | 1 | 917 | 1 | 1 |
| 14,882 | 26 | 2011 | 25.55 | 2011-03-07 | 15,040 | 2011-06-30 | 15,155 | 115 | 109 | referral | pg-pab | si | 25,150 | 2012 | 23.53 | 2009-03-02 | 14,305 | 2012-03-21 | 15,420 | 1,115 | 109 | referral | pg-pai | si | 14882_25150 | 1 | 1 | both yes | 1 | 850 | 1 | 1 |
| 17,443 | 26 | 2011 | 25.86 | 2011-07-01 | 15,156 | 2011-08-01 | 15,187 | 31 | 109 | referral | pg-pai | si | 23,472 | 2012 | 23.35 | 2008-12-25 | 14,238 | 2012-01-31 | 15,370 | 1,132 | 109 | referral | pg-pab | si | 17443_23472 | 1 | 1 | both yes | 1 | 949 | 1 | 1 |
| 17,443 | 26 | 2011 | 25.86 | 2011-07-01 | 15,156 | 2011-08-01 | 15,187 | 31 | 109 | referral | pg-pai | si | 25,150 | 2012 | 23.53 | 2009-03-02 | 14,305 | 2012-03-21 | 15,420 | 1,115 | 109 | referral | pg-pai | si | 17443_25150 | 1 | 1 | both yes | 1 | 882 | 1 | 1 |
| 38,847 | 27 | 2013 | 22.41 | 2011-10-19 | 15,266 | 2013-05-29 | 15,854 | 588 | 109 | referral | pg-pai | si | 40,104 | 2013 | 23.96 | 2013-05-07 | 15,832 | 2013-08-22 | 15,939 | 107 | 117 | late adm discharge | pg-pr | no | 38847_40104 | 0 | 0 | 1 | 22 | 0 | 0 | |
| 26,405 | 27 | 2012 | 22.91 | 2012-04-18 | 15,448 | 2012-08-27 | 15,579 | 131 | 117 | late adm discharge | pg-pr | si | 38,847 | 2013 | 22.41 | 2011-10-19 | 15,266 | 2013-05-29 | 15,854 | 588 | 109 | referral | pg-pai | si | 26405_38847 | 0 | 1 | both yes | 0 | 313 | 1 | 1 |
| 30,714 | 27 | 2012 | 23.36 | 2012-09-28 | 15,611 | 2013-03-29 | 15,793 | 182 | 109 | referral | pg-pab | si | 38,847 | 2013 | 22.41 | 2011-10-19 | 15,266 | 2013-05-29 | 15,854 | 588 | 109 | referral | pg-pai | si | 30714_38847 | 1 | 1 | both yes | 1 | 527 | 1 | 1 |
| 163,071 | 28 | 2019 | 29.80 | 2019-02-25 | 17,952 | 2019-07-31 | 18,108 | 156 | 408 | referral | pg-pai | si | 173,888 | 2020 | 29.87 | 2019-03-21 | 17,976 | 2020-07-31 | 18,474 | 498 | 342 | referral | m-pai | si | 163071_173888 | 0 | 1 | both yes | 1 | 132 | 1 | 0 |
| 173,888 | 28 | 2020 | 29.87 | 2019-03-21 | 17,976 | 2020-07-31 | 18,474 | 498 | 342 | referral | m-pai | si | 190,835 | 2021 | 31.03 | 2020-05-19 | 18,401 | 2021-02-25 | 18,683 | 282 | 408 | completion | pg-pai | si | 173888_190835 | 0 | 1 | both yes | 1 | 73 | 0 | 0 |
| 13,414 | 29 | 2011 | 34.92 | 2009-01-27 | 14,271 | 2011-01-31 | 15,005 | 734 | 109 | referral | pg-pab | si | 27,700 | 2012 | 35.71 | 2009-11-11 | 14,559 | 2012-10-31 | 15,644 | 1,085 | 109 | referral | pg-pab | si | 13414_27700 | 1 | 1 | both yes | 1 | 446 | 1 | 0 |
| 21,030 | 29 | 2012 | 36.94 | 2011-02-01 | 15,006 | 2012-05-31 | 15,491 | 485 | 109 | referral | pg-pai | si | 27,700 | 2012 | 35.71 | 2009-11-11 | 14,559 | 2012-10-31 | 15,644 | 1,085 | 109 | referral | pg-pab | si | 21030_27700 | 1 | 0 | both yes | 1 | 932 | 1 | 1 |
| 43,675 | 30 | 2013 | 31.79 | 2013-08-07 | 15,924 | 2013-11-15 | 16,024 | 100 | 433 | late dropout | pg-pab | si | 66,054 | 2015 | 32.05 | 2013-11-11 | 16,020 | 2015-03-03 | 16,497 | 477 | 438 | referral | m-pai | si | 43675_66054 | 0 | 1 | both yes | 0 | 4 | 1 | 0 |
| 66,054 | 30 | 2015 | 32.05 | 2013-11-11 | 16,020 | 2015-03-03 | 16,497 | 477 | 438 | referral | m-pai | si | 73,791 | 2015 | 33.36 | 2015-03-02 | 16,496 | 2015-03-24 | 16,518 | 22 | 159 | early dropout | m-pr | si | 66054_73791 | 0 | 0 | both yes | 1 | 1 | 0 | 0 |
| 16,905 | 31 | 2011 | 29.22 | 2008-07-08 | 14,068 | 2012-01-27 | 15,366 | 1,298 | 123 | referral | pg-pab | si | 28,504 | 2012 | 30.29 | 2009-08-06 | 14,462 | 2012-09-28 | 15,611 | 1,149 | 123 | late dropout | pg-pab | si | 16905_28504 | 1 | 1 | both yes | 1 | 904 | 0 | 0 |
| 8,967 | 31 | 2010 | 31.59 | 2010-11-24 | 14,937 | 2010-12-20 | 14,963 | 26 | 123 | referral | pg-pai | si | 16,905 | 2011 | 29.22 | 2008-07-08 | 14,068 | 2012-01-27 | 15,366 | 1,298 | 123 | referral | pg-pab | si | 8967_16905 | 1 | 1 | both yes | 1 | 895 | 1 | 1 |
| 8,967 | 31 | 2010 | 31.59 | 2010-11-24 | 14,937 | 2010-12-20 | 14,963 | 26 | 123 | referral | pg-pai | si | 28,504 | 2012 | 30.29 | 2009-08-06 | 14,462 | 2012-09-28 | 15,611 | 1,149 | 123 | late dropout | pg-pab | si | 8967_28504 | 1 | 1 | both yes | 1 | 501 | 1 | 1 |
| 13,338 | 31 | 2011 | 31.67 | 2010-12-20 | 14,963 | 2011-01-24 | 14,998 | 35 | 117 | early adm discharge | pg-pr | si | 16,905 | 2011 | 29.22 | 2008-07-08 | 14,068 | 2012-01-27 | 15,366 | 1,298 | 123 | referral | pg-pab | si | 13338_16905 | 0 | 0 | both yes | 0 | 930 | 1 | 1 |
| 13,338 | 31 | 2011 | 31.67 | 2010-12-20 | 14,963 | 2011-01-24 | 14,998 | 35 | 117 | early adm discharge | pg-pr | si | 28,504 | 2012 | 30.29 | 2009-08-06 | 14,462 | 2012-09-28 | 15,611 | 1,149 | 123 | late dropout | pg-pab | si | 13338_28504 | 0 | 1 | both yes | 0 | 536 | 1 | 1 |
| 35,639 | 32 | 2013 | 22.90 | 2013-01-18 | 15,723 | 2013-12-30 | 16,069 | 346 | 225 | completion | pg-pai | si | 38,713 | 2013 | 23.15 | 2013-04-22 | 15,817 | 2013-09-27 | 15,975 | 158 | 255 | late dropout | pg-pab | si | 35639_38713 | 0 | 0 | both yes | 0 | 252 | 0 | 0 |
| 35,639 | 32 | 2013 | 22.90 | 2013-01-18 | 15,723 | 2013-12-30 | 16,069 | 346 | 225 | completion | pg-pai | si | 51,795 | 2014 | 23.66 | 2013-10-23 | 16,001 | 2014-01-30 | 16,100 | 99 | 255 | late dropout | pg-pab | si | 35639_51795 | 0 | 1 | both yes | 0 | 68 | 0 | 0 |
| 53,516 | 33 | 2014 | 25.98 | 2014-01-07 | 16,077 | 2014-04-01 | 16,161 | 84 | 146 | early dropout | pg-pab | si | 87,420 | 2016 | 24.82 | 2012-11-09 | 15,653 | 2016-05-18 | 16,939 | 1,286 | 146 | referral | pg-pai | si | 53516_87420 | 1 | 1 | both yes | 0 | 508 | 1 | 1 |
| 73,632 | 33 | 2015 | 27.01 | 2015-01-16 | 16,451 | 2015-04-30 | 16,555 | 104 | 146 | referral | pg-pr | no | 87,420 | 2016 | 24.82 | 2012-11-09 | 15,653 | 2016-05-18 | 16,939 | 1,286 | 146 | referral | pg-pai | si | 73632_87420 | 1 | 1 | second yes | 1 | 902 | 1 | 1 |
| 54,900 | 34 | 2014 | 20.16 | 2014-02-03 | 16,104 | 2014-04-29 | 16,189 | 85 | 212 | referral | pg-pai | si | 57,107 | 2014 | 20.34 | 2014-04-10 | 16,170 | 2014-06-24 | 16,245 | 75 | 354 | early dropout | pg-pr | si | 54900_57107 | 0 | 0 | both yes | 1 | 19 | 0 | 0 |
| 57,107 | 34 | 2014 | 20.34 | 2014-04-10 | 16,170 | 2014-06-24 | 16,245 | 75 | 354 | early dropout | pg-pr | si | 58,996 | 2014 | 20.53 | 2014-06-19 | 16,240 | 2014-07-10 | 16,261 | 21 | 212 | early dropout | pg-pai | si | 57107_58996 | 0 | 0 | both yes | 0 | 5 | 0 | 0 |
| 3,282 | 35 | 2010 | 37.56 | 2009-10-05 | 14,522 | 2010-01-04 | 14,613 | 91 | 109 | late dropout | pg-pai | si | 3,871 | 2010 | 37.14 | 2009-05-05 | 14,369 | 2011-06-09 | 15,134 | 765 | 109 | late dropout | pg-pab | si | 3282_3871 | 1 | 0 | both yes | 0 | 244 | 1 | 1 |
| 2,632 | 35 | 2010 | 37.58 | 2009-10-12 | 14,529 | 2010-01-04 | 14,613 | 84 | 109 | early dropout | pg-pab | si | 3,282 | 2010 | 37.56 | 2009-10-05 | 14,522 | 2010-01-04 | 14,613 | 91 | 109 | late dropout | pg-pai | si | 2632_3282 | 1 | 0 | both yes | 0 | 91 | 1 | 0 |
| 2,632 | 35 | 2010 | 37.58 | 2009-10-12 | 14,529 | 2010-01-04 | 14,613 | 84 | 109 | early dropout | pg-pab | si | 3,871 | 2010 | 37.14 | 2009-05-05 | 14,369 | 2011-06-09 | 15,134 | 765 | 109 | late dropout | pg-pab | si | 2632_3871 | 1 | 0 | both yes | 0 | 244 | 1 | 1 |
| 70,771 | 36 | 2015 | 21.16 | 2014-12-05 | 16,409 | 2015-05-01 | 16,556 | 147 | 202 | referral | pg-pai | si | 87,269 | 2016 | 21.56 | 2015-04-29 | 16,554 | 2016-05-29 | 16,950 | 396 | 215 | completion | pg-pr | si | 70771_87269 | 0 | 1 | both yes | 1 | 2 | 1 | 0 |
| 87,269 | 36 | 2016 | 21.56 | 2015-04-29 | 16,554 | 2016-05-29 | 16,950 | 396 | 215 | completion | pg-pr | si | 90,126 | 2016 | 22.04 | 2015-10-21 | 16,729 | 2016-11-28 | 17,133 | 404 | 215 | completion | pg-pr | si | 87269_90126 | 1 | 0 | both yes | 0 | 221 | 1 | 0 |
| 34,867 | 37 | 2013 | 35.76 | 2012-11-01 | 15,645 | 2013-12-23 | 16,062 | 417 | 179 | completion | pg-pr | si | 54,152 | 2014 | 36.89 | 2013-12-16 | 16,055 | 2014-06-02 | 16,223 | 168 | 365 | referral | pg-pai | si | 34867_54152 | 0 | 1 | both yes | 0 | 7 | 0 | 0 |
| 54,152 | 37 | 2014 | 36.89 | 2013-12-16 | 16,055 | 2014-06-02 | 16,223 | 168 | 365 | referral | pg-pai | si | 58,961 | 2014 | 37.34 | 2014-06-01 | 16,222 | 2014-09-01 | 16,314 | 92 | 179 | late dropout | pg-pr | si | 54152_58961 | 0 | 0 | both yes | 1 | 1 | 0 | 0 |
| 2,678 | 38 | 2010 | 45.60 | 2009-09-29 | 14,516 | 2010-04-30 | 14,729 | 213 | 109 | referral | pg-pab | si | 5,505 | 2010 | 45.67 | 2009-10-26 | 14,543 | 2010-05-31 | 14,760 | 217 | 109 | referral | pg-pai | si | 2678_5505 | 1 | 0 | both yes | 1 | 186 | 1 | 0 |
| 2,678 | 38 | 2010 | 45.60 | 2009-09-29 | 14,516 | 2010-04-30 | 14,729 | 213 | 109 | referral | pg-pab | si | 10,716 | 2011 | 44.93 | 2009-01-29 | 14,273 | 2011-10-31 | 15,278 | 1,005 | 109 | referral | pg-pab | si | 2678_10716 | 1 | 1 | both yes | 1 | 456 | 1 | 1 |
| 5,505 | 38 | 2010 | 45.67 | 2009-10-26 | 14,543 | 2010-05-31 | 14,760 | 217 | 109 | referral | pg-pai | si | 10,716 | 2011 | 44.93 | 2009-01-29 | 14,273 | 2011-10-31 | 15,278 | 1,005 | 109 | referral | pg-pab | si | 5505_10716 | 1 | 1 | both yes | 1 | 487 | 1 | 1 |
| 5,359 | 39 | 2010 | 40.66 | 2010-05-03 | 14,732 | 2010-09-30 | 14,882 | 150 | 251 | referral | pg-pab | si | 11,825 | 2011 | 41.07 | 2010-09-29 | 14,881 | 2011-01-21 | 14,995 | 114 | 260 | referral | pg-pai | si | 5359_11825 | 0 | 1 | both yes | 1 | 1 | 0 | 0 |
| 11,825 | 39 | 2011 | 41.07 | 2010-09-29 | 14,881 | 2011-01-21 | 14,995 | 114 | 260 | referral | pg-pai | si | 13,130 | 2011 | 41.36 | 2011-01-14 | 14,988 | 2011-09-18 | 15,235 | 247 | 234 | completion | m-pr | si | 11825_13130 | 0 | 0 | both yes | 1 | 7 | 1 | 0 |
| 68,328 | 40 | 2015 | 26.25 | 2014-08-18 | 16,300 | 2015-08-03 | 16,650 | 350 | 430 | late dropout | pg-pai | si | 75,755 | 2015 | 26.93 | 2015-04-27 | 16,552 | 2015-06-16 | 16,602 | 50 | 488 | early dropout | m-pr | no | 68328_75755 | 0 | 0 | 0 | 98 | 0 | 0 | |
| 68,328 | 40 | 2015 | 26.25 | 2014-08-18 | 16,300 | 2015-08-03 | 16,650 | 350 | 430 | late dropout | pg-pai | si | 80,472 | 2015 | 27.20 | 2015-08-01 | 16,648 | 2015-10-07 | 16,715 | 67 | 161 | referral | m-pai | si | 68328_80472 | 0 | 0 | both yes | 0 | 2 | 0 | 0 |
| 36,873 | 41 | 2013 | 27.84 | 2013-01-21 | 15,726 | 2013-06-24 | 15,880 | 154 | 155 | late dropout | pg-pab | si | 40,999 | 2013 | 28.23 | 2013-06-12 | 15,868 | 2013-06-14 | 15,870 | 2 | 147 | early dropout | pg-pr | no | 36873_40999 | 0 | 0 | 0 | 12 | 0 | 0 | |
| 36,873 | 41 | 2013 | 27.84 | 2013-01-21 | 15,726 | 2013-06-24 | 15,880 | 154 | 155 | late dropout | pg-pab | si | 42,419 | 2013 | 28.21 | 2013-06-06 | 15,862 | 2013-11-29 | 16,038 | 176 | 254 | late adm discharge | pg-pai | si | 36873_42419 | 0 | 0 | both yes | 0 | 18 | 1 | 0 |
| 40,999 | 41 | 2013 | 28.23 | 2013-06-12 | 15,868 | 2013-06-14 | 15,870 | 2 | 147 | early dropout | pg-pr | no | 42,419 | 2013 | 28.21 | 2013-06-06 | 15,862 | 2013-11-29 | 16,038 | 176 | 254 | late adm discharge | pg-pai | si | 40999_42419 | 0 | 0 | second yes | 0 | 8 | 1 | 1 |
| 17,363 | 42 | 2011 | 33.04 | 2011-07-14 | 15,169 | 2011-09-13 | 15,230 | 61 | 262 | referral | m-pai | si | 18,983 | 2011 | 33.20 | 2011-09-12 | 15,229 | 2011-11-17 | 15,295 | 66 | 246 | referral | pg-pai | si | 17363_18983 | 0 | 0 | both yes | 1 | 1 | 1 | 0 |
| 18,983 | 42 | 2011 | 33.20 | 2011-09-12 | 15,229 | 2011-11-17 | 15,295 | 66 | 246 | referral | pg-pai | si | 20,194 | 2011 | 33.38 | 2011-11-15 | 15,293 | 2011-11-28 | 15,306 | 13 | 243 | early adm discharge | m-pr | si | 18983_20194 | 0 | 0 | both yes | 1 | 2 | 0 | 0 |
| 11,071 | 43 | 2011 | 19.82 | 2010-08-20 | 14,841 | 2011-01-24 | 14,998 | 157 | 195 | late adm discharge | pg-pab | si | 15,672 | 2011 | 20.22 | 2011-01-12 | 14,986 | 2019-12-31 | 18,261 | 3,275 | 269 | adm truncated | pg-pr | no | 11071_15672 | 0 | 0 | 0 | 12 | 1 | 0 | |
| 15,672 | 43 | 2011 | 20.22 | 2011-01-12 | 14,986 | 2019-12-31 | 18,261 | 3,275 | 269 | adm truncated | pg-pr | no | 20,199 | 2011 | 20.70 | 2011-07-05 | 15,160 | 2019-12-31 | 18,261 | 3,101 | 269 | adm truncated | pg-pr | no | 15672_20199 | 1 | 0 | both no | 0 | 3,101 | 0 | 0 |
| 209,491 | 44 | 2022 | 40.20 | 2021-10-01 | 18,901 | 2022-07-29 | 19,202 | 301 | 168 | referral | m-pai | si | 216,140 | 2022 | 40.86 | 2022-05-30 | 19,142 | 2022-06-20 | 19,163 | 21 | 795 | referral | m-pr | no | 209491_216140 | 0 | 0 | 1 | 60 | 0 | 0 | |
| 209,491 | 44 | 2022 | 40.20 | 2021-10-01 | 18,901 | 2022-07-29 | 19,202 | 301 | 168 | referral | m-pai | si | 216,999 | 2022 | 40.94 | 2022-06-29 | 19,172 | 2022-08-24 | 19,228 | 56 | 795 | early dropout | m-pai | si | 209491_216999 | 0 | 0 | both yes | 1 | 30 | 0 | 0 |
| 9,504 | 45 | 2010 | 28.01 | 2010-12-01 | 14,944 | 2010-12-31 | 14,974 | 30 | 185 | referral | pg-pab | si | 12,774 | 2011 | 28.04 | 2010-12-13 | 14,956 | 2011-10-20 | 15,267 | 311 | 185 | referral | pg-pai | si | 9504_12774 | 1 | 1 | both yes | 1 | 18 | 1 | 0 |
| 12,774 | 45 | 2011 | 28.04 | 2010-12-13 | 14,956 | 2011-10-20 | 15,267 | 311 | 185 | referral | pg-pai | si | 22,564 | 2012 | 28.89 | 2011-10-19 | 15,266 | 2012-03-30 | 15,429 | 163 | 197 | late adm discharge | m-pr | no | 12774_22564 | 0 | 1 | 1 | 1 | 0 | 0 | |
| 43,099 | 46 | 2013 | 40.11 | 2013-08-02 | 15,919 | 2013-11-14 | 16,023 | 104 | 469 | referral | pg-pab | si | 51,830 | 2014 | 40.33 | 2013-10-22 | 16,000 | 2014-08-06 | 16,288 | 288 | 137 | completion | m-pai | si | 43099_51830 | 0 | 1 | both yes | 1 | 23 | 1 | 0 |
| 51,830 | 46 | 2014 | 40.33 | 2013-10-22 | 16,000 | 2014-08-06 | 16,288 | 288 | 137 | completion | m-pai | si | 68,070 | 2015 | 41.11 | 2014-08-04 | 16,286 | 2015-10-21 | 16,729 | 443 | 561 | completion | pg-pai | si | 51830_68070 | 0 | 1 | both yes | 0 | 2 | 1 | 0 |
| 2,429 | 47 | 2010 | 40.19 | 2010-01-25 | 14,634 | 2010-04-16 | 14,715 | 81 | 219 | early dropout | m-pr | si | 22,971 | 2012 | 39.03 | 2008-11-28 | 14,211 | 2012-08-30 | 15,582 | 1,371 | 123 | referral | pg-pab | si | 2429_22971 | 0 | 1 | both yes | 0 | 504 | 1 | 1 |
| 10,739 | 47 | 2011 | 40.62 | 2010-07-01 | 14,791 | 2011-04-19 | 15,083 | 292 | 205 | late adm discharge | pg-pai | si | 22,971 | 2012 | 39.03 | 2008-11-28 | 14,211 | 2012-08-30 | 15,582 | 1,371 | 123 | referral | pg-pab | si | 10739_22971 | 0 | 1 | both yes | 0 | 872 | 1 | 1 |
| 60,019 | 47 | 2014 | 44.61 | 2014-06-26 | 16,247 | 2014-10-30 | 16,373 | 126 | 353 | referral | pg-pai | si | 69,890 | 2015 | 44.94 | 2014-10-28 | 16,371 | 2015-01-12 | 16,447 | 76 | 345 | referral | m-pr | si | 60019_69890 | 0 | 1 | both yes | 1 | 2 | 0 | 0 |
| 69,890 | 47 | 2015 | 44.94 | 2014-10-28 | 16,371 | 2015-01-12 | 16,447 | 76 | 345 | referral | m-pr | si | 72,004 | 2015 | 45.13 | 2015-01-02 | 16,437 | 2015-06-19 | 16,605 | 168 | 347 | referral | pg-pai | si | 69890_72004 | 0 | 0 | both yes | 1 | 10 | 1 | 0 |
| 208,911 | 48 | 2022 | 54.74 | 2021-09-14 | 18,884 | 2022-02-08 | 19,031 | 147 | 750 | referral | pg-pab | si | 212,965 | 2022 | 55.05 | 2022-01-05 | 18,997 | 2022-11-02 | 19,298 | 301 | 272 | referral | pg-pai | si | 208911_212965 | 0 | 0 | both yes | 1 | 34 | 1 | 0 |
| 212,833 | 48 | 2022 | 55.15 | 2022-02-09 | 19,032 | 2022-02-21 | 19,044 | 12 | 272 | early dropout | pg-pai | si | 212,965 | 2022 | 55.05 | 2022-01-05 | 18,997 | 2022-11-02 | 19,298 | 301 | 272 | referral | pg-pai | si | 212833_212965 | 1 | 0 | both yes | 0 | 47 | 1 | 1 |
| 27,819 | 49 | 2012 | 35.06 | 2012-06-20 | 15,511 | 2012-07-30 | 15,551 | 40 | 204 | referral | pg-pab | si | 28,664 | 2012 | 35.12 | 2012-07-12 | 15,533 | 2012-08-16 | 15,568 | 35 | 205 | referral | pg-pai | si | 27819_28664 | 0 | 0 | both yes | 1 | 18 | 0 | 0 |
| 28,664 | 49 | 2012 | 35.12 | 2012-07-12 | 15,533 | 2012-08-16 | 15,568 | 35 | 205 | referral | pg-pai | si | 29,097 | 2012 | 35.19 | 2012-08-07 | 15,559 | 2012-08-16 | 15,568 | 9 | 215 | early dropout | pg-pr | si | 28664_29097 | 0 | 0 | both yes | 1 | 9 | 0 | 0 |
| 1,281 | 50 | 2010 | 33.64 | 2009-12-23 | 14,601 | 2010-05-03 | 14,732 | 131 | 249 | late dropout | pg-pab | si | 10,331 | 2011 | 33.90 | 2010-03-29 | 14,697 | 2011-03-02 | 15,035 | 338 | 300 | completion | m-pr | no | 1281_10331 | 0 | 1 | 0 | 35 | 1 | 0 | |
| 10,331 | 50 | 2011 | 33.90 | 2010-03-29 | 14,697 | 2011-03-02 | 15,035 | 338 | 300 | completion | m-pr | no | 13,983 | 2011 | 34.81 | 2011-02-24 | 15,029 | 2011-06-07 | 15,132 | 103 | 262 | late dropout | m-pab | si | 10331_13983 | 0 | 0 | second yes | 0 | 6 | 0 | 0 |
| 50,065 | 51 | 2014 | 37.88 | 2013-07-31 | 15,917 | 2014-03-21 | 16,150 | 233 | 125 | referral | pg-pai | si | 55,877 | 2014 | 38.47 | 2014-03-05 | 16,134 | 2014-06-16 | 16,237 | 103 | 125 | referral | pg-pab | si | 50065_55877 | 1 | 0 | both yes | 1 | 16 | 0 | 0 |
| 55,877 | 51 | 2014 | 38.47 | 2014-03-05 | 16,134 | 2014-06-16 | 16,237 | 103 | 125 | referral | pg-pab | si | 85,681 | 2016 | 38.68 | 2014-05-19 | 16,209 | 2016-05-16 | 16,937 | 728 | 353 | completion | pg-pai | no | 55877_85681 | 0 | 1 | 1 | 28 | 1 | 0 | |
| 25,237 | 52 | 2012 | 38.88 | 2012-03-06 | 15,405 | 2012-04-12 | 15,442 | 37 | 171 | referral | pg-pai | si | 65,490 | 2015 | 37.64 | 2010-12-10 | 14,953 | 2015-05-29 | 16,584 | 1,631 | 166 | completion | pg-pai | si | 25237_65490 | 0 | 1 | both yes | 1 | 489 | 1 | 1 |
| 33,552 | 52 | 2013 | 39.29 | 2012-08-01 | 15,553 | 2013-02-01 | 15,737 | 184 | 171 | referral | pg-pai | si | 65,490 | 2015 | 37.64 | 2010-12-10 | 14,953 | 2015-05-29 | 16,584 | 1,631 | 166 | completion | pg-pai | si | 33552_65490 | 0 | 1 | both yes | 1 | 784 | 1 | 1 |
| 24,667 | 53 | 2012 | 31.20 | 2012-02-15 | 15,385 | 2012-05-18 | 15,478 | 93 | 161 | referral | m-pai | si | 27,109 | 2012 | 31.45 | 2012-05-17 | 15,477 | 2012-10-03 | 15,616 | 139 | 275 | late dropout | m-pr | si | 24667_27109 | 0 | 0 | both yes | 1 | 1 | 1 | 0 |
| 27,109 | 53 | 2012 | 31.45 | 2012-05-17 | 15,477 | 2012-10-03 | 15,616 | 139 | 275 | late dropout | m-pr | si | 48,001 | 2014 | 31.83 | 2012-10-02 | 15,615 | 2015-02-16 | 16,482 | 867 | 161 | late dropout | m-pai | si | 27109_48001 | 0 | 1 | both yes | 0 | 1 | 1 | 0 |
| 58,442 | 54 | 2014 | 34.85 | 2014-06-04 | 16,225 | 2014-09-25 | 16,338 | 113 | 225 | referral | pg-pab | si | 62,112 | 2014 | 35.15 | 2014-09-22 | 16,335 | 2014-10-30 | 16,373 | 38 | 235 | early dropout | pg-pr | si | 58442_62112 | 0 | 0 | both yes | 1 | 3 | 0 | 0 |
| 62,112 | 54 | 2014 | 35.15 | 2014-09-22 | 16,335 | 2014-10-30 | 16,373 | 38 | 235 | early dropout | pg-pr | si | 63,636 | 2014 | 35.22 | 2014-10-16 | 16,359 | 2015-03-16 | 16,510 | 151 | 225 | late dropout | pg-pab | si | 62112_63636 | 0 | 0 | both yes | 0 | 14 | 1 | 0 |
| 194 | 55 | 2010 | 23.63 | 2009-10-08 | 14,525 | 19,475 | 275 | early dropout | m-pr | si | 4,670 | 2010 | 24.19 | 2010-04-29 | 14,728 | 2010-05-03 | 14,732 | 4 | 159 | early dropout | m-pr | si | 194_4670 | 0 | 0 | both yes | 0 | 4,747 | 0 | |||
| 194 | 55 | 2010 | 23.63 | 2009-10-08 | 14,525 | 19,475 | 275 | early dropout | m-pr | si | 24,666 | 2012 | 25.97 | 2012-02-10 | 15,380 | 2012-06-01 | 15,492 | 112 | 161 | referral | m-pai | si | 194_24666 | 0 | 1 | both yes | 0 | 4,095 | 0 | |||
| 194 | 55 | 2010 | 23.63 | 2009-10-08 | 14,525 | 19,475 | 275 | early dropout | m-pr | si | 200,171 | 2021 | 35.39 | 2021-07-13 | 18,821 | 2021-09-01 | 18,871 | 50 | 143 | early dropout | pg-pai | si | 194_200171 | 0 | 1 | both yes | 0 | 654 | 0 | |||
| 35,516 | 56 | 2013 | 34.71 | 2013-01-14 | 15,719 | 2013-04-29 | 15,824 | 105 | 249 | referral | pg-pab | si | 37,922 | 2013 | 34.83 | 2013-02-25 | 15,761 | 2013-05-01 | 15,826 | 65 | 238 | referral | pg-pai | no | 35516_37922 | 0 | 0 | 1 | 63 | 0 | 0 | |
| 35,516 | 56 | 2013 | 34.71 | 2013-01-14 | 15,719 | 2013-04-29 | 15,824 | 105 | 249 | referral | pg-pab | si | 39,524 | 2013 | 34.98 | 2013-04-22 | 15,817 | 2013-06-17 | 15,873 | 56 | 285 | early dropout | pg-pr | si | 35516_39524 | 0 | 0 | both yes | 1 | 7 | 0 | 0 |
| 37,922 | 56 | 2013 | 34.83 | 2013-02-25 | 15,761 | 2013-05-01 | 15,826 | 65 | 238 | referral | pg-pai | no | 39,524 | 2013 | 34.98 | 2013-04-22 | 15,817 | 2013-06-17 | 15,873 | 56 | 285 | early dropout | pg-pr | si | 37922_39524 | 0 | 0 | second yes | 1 | 9 | 0 | 0 |
| 2,603 | 57 | 2010 | 23.74 | 2010-01-20 | 14,629 | 19,475 | 117 | early dropout | pg-pr | si | 24,854 | 2012 | 25.81 | 2012-02-16 | 15,386 | 2012-03-07 | 15,406 | 20 | 120 | referral | pg-pai | si | 2603_24854 | 0 | 1 | both yes | 0 | 4,089 | 0 | |||
| 2,603 | 57 | 2010 | 23.74 | 2010-01-20 | 14,629 | 19,475 | 117 | early dropout | pg-pr | si | 25,221 | 2012 | 25.87 | 2012-03-08 | 15,407 | 2012-05-08 | 15,468 | 61 | 362 | early dropout | m-pr | si | 2603_25221 | 0 | 1 | both yes | 0 | 4,068 | 0 | |||
| 42,920 | 58 | 2013 | 39.48 | 2012-08-23 | 15,575 | 2013-11-01 | 16,010 | 435 | 248 | referral | pg-pai | si | 51,617 | 2014 | 40.65 | 2013-10-24 | 16,002 | 2014-04-20 | 16,180 | 178 | 271 | late adm discharge | pg-pr | si | 42920_51617 | 0 | 1 | both yes | 1 | 8 | 0 | 0 |
| 36,594 | 58 | 2013 | 39.84 | 2013-01-02 | 15,707 | 2013-08-01 | 15,918 | 211 | 248 | late adm discharge | pg-pai | no | 42,920 | 2013 | 39.48 | 2012-08-23 | 15,575 | 2013-11-01 | 16,010 | 435 | 248 | referral | pg-pai | si | 36594_42920 | 1 | 0 | second yes | 0 | 343 | 1 | 1 |
| 1,752 | 59 | 2010 | 27.99 | 2009-08-17 | 14,473 | 2010-11-30 | 14,943 | 470 | 287 | late dropout | pg-pab | si | 29,412 | 2012 | 28.03 | 2009-08-30 | 14,486 | 2013-03-12 | 15,776 | 1,290 | 287 | late dropout | pg-pab | si | 1752_29412 | 1 | 1 | both yes | 0 | 457 | 1 | 0 |
| 20,899 | 59 | 2012 | 29.37 | 2011-01-03 | 14,977 | 2012-05-02 | 15,462 | 485 | 287 | late dropout | pg-pab | si | 29,412 | 2012 | 28.03 | 2009-08-30 | 14,486 | 2013-03-12 | 15,776 | 1,290 | 287 | late dropout | pg-pab | si | 20899_29412 | 1 | 0 | both yes | 0 | 976 | 1 | 1 |
| 26,804 | 60 | 2012 | 33.75 | 2012-05-24 | 15,484 | 2012-11-27 | 15,671 | 187 | 146 | referral | pg-pab | si | 31,405 | 2012 | 34.17 | 2012-10-25 | 15,638 | 2012-12-17 | 15,691 | 53 | 142 | completion | pg-pr | si | 26804_31405 | 0 | 0 | both yes | 1 | 33 | 0 | 0 |
| 26,804 | 60 | 2012 | 33.75 | 2012-05-24 | 15,484 | 2012-11-27 | 15,671 | 187 | 146 | referral | pg-pab | si | 48,099 | 2014 | 34.26 | 2012-11-25 | 15,669 | 2014-03-31 | 16,160 | 491 | 142 | completion | m-pr | si | 26804_48099 | 0 | 1 | both yes | 1 | 2 | 1 | 0 |
| 31,405 | 60 | 2012 | 34.17 | 2012-10-25 | 15,638 | 2012-12-17 | 15,691 | 53 | 142 | completion | pg-pr | si | 48,099 | 2014 | 34.26 | 2012-11-25 | 15,669 | 2014-03-31 | 16,160 | 491 | 142 | completion | m-pr | si | 31405_48099 | 1 | 1 | both yes | 0 | 22 | 1 | 0 |
| 25,083 | 61 | 2012 | 34.80 | 2012-02-21 | 15,391 | 2012-06-04 | 15,495 | 104 | 251 | referral | pg-pab | si | 27,357 | 2012 | 35.05 | 2012-05-23 | 15,483 | 2012-11-19 | 15,663 | 180 | 260 | referral | pg-pai | si | 25083_27357 | 0 | 0 | both yes | 1 | 12 | 1 | 0 |
| 27,357 | 61 | 2012 | 35.05 | 2012-05-23 | 15,483 | 2012-11-19 | 15,663 | 180 | 260 | referral | pg-pai | si | 34,809 | 2013 | 35.53 | 2012-11-14 | 15,658 | 2013-05-31 | 15,856 | 198 | 243 | late dropout | m-pr | si | 27357_34809 | 0 | 1 | both yes | 1 | 5 | 1 | 0 |
| 73,881 | 62 | 2015 | 28.49 | 2015-03-25 | 16,519 | 2015-05-01 | 16,556 | 37 | 260 | referral | pg-pai | si | 76,076 | 2015 | 28.58 | 2015-04-27 | 16,552 | 2015-06-02 | 16,588 | 36 | 358 | early adm discharge | pg-pr | si | 73881_76076 | 0 | 0 | both yes | 1 | 4 | 0 | 0 |
| 76,076 | 62 | 2015 | 28.58 | 2015-04-27 | 16,552 | 2015-06-02 | 16,588 | 36 | 358 | early adm discharge | pg-pr | si | 77,408 | 2015 | 28.66 | 2015-05-27 | 16,582 | 2015-08-12 | 16,659 | 77 | 260 | referral | pg-pai | si | 76076_77408 | 0 | 0 | both yes | 0 | 6 | 1 | 0 |
| 8,239 | 63 | 2010 | 27.79 | 2010-10-07 | 14,889 | 19,475 | 275 | early dropout | m-pr | si | 32,167 | 2013 | 28.30 | 2011-04-12 | 15,076 | 2013-12-18 | 16,057 | 981 | 224 | late adm discharge | pg-pai | si | 8239_32167 | 0 | 1 | both yes | 0 | 4,399 | 0 | |||
| 8,239 | 63 | 2010 | 27.79 | 2010-10-07 | 14,889 | 19,475 | 275 | early dropout | m-pr | si | 97,354 | 2016 | 33.42 | 2016-05-23 | 16,944 | 2016-11-01 | 17,106 | 162 | 277 | referral | m-pr | si | 8239_97354 | 0 | 1 | both yes | 0 | 2,531 | 0 | |||
| 11,442 | 64 | 2011 | 28.84 | 2007-10-14 | 13,800 | 2011-01-31 | 15,005 | 1,205 | 136 | late adm discharge | pg-pai | no | 13,156 | 2011 | 31.80 | 2010-10-01 | 14,883 | 2011-08-27 | 15,213 | 330 | 136 | late adm discharge | pg-pai | si | 11442_13156 | 1 | 0 | second yes | 0 | 122 | 0 | 0 |
| 753 | 64 | 2010 | 30.00 | 2008-12-11 | 14,224 | 2010-02-16 | 14,656 | 432 | 136 | referral | pg-pab | si | 11,442 | 2011 | 28.84 | 2007-10-14 | 13,800 | 2011-01-31 | 15,005 | 1,205 | 136 | late adm discharge | pg-pai | no | 753_11442 | 1 | 1 | 1 | 856 | 1 | 1 | |
| 8,764 | 65 | 2010 | 40.96 | 2007-11-09 | 13,826 | 2011-07-28 | 15,183 | 1,357 | 109 | completion | pg-pab | si | 37,235 | 2013 | 43.27 | 2010-03-01 | 14,669 | 2013-02-28 | 15,764 | 1,095 | 109 | referral | pg-pab | si | 8764_37235 | 1 | 1 | both yes | 0 | 514 | 0 | 0 |
| 8,764 | 65 | 2010 | 40.96 | 2007-11-09 | 13,826 | 2011-07-28 | 15,183 | 1,357 | 109 | completion | pg-pab | si | 85,373 | 2016 | 43.35 | 2010-03-31 | 14,699 | 2016-07-01 | 16,983 | 2,284 | 109 | completion | pg-pai | si | 8764_85373 | 1 | 1 | both yes | 0 | 484 | 1 | 0 |
| 5,531 | 65 | 2010 | 42.96 | 2009-11-09 | 14,557 | 2010-09-30 | 14,882 | 325 | 109 | late adm discharge | pg-pai | si | 8,764 | 2010 | 40.96 | 2007-11-09 | 13,826 | 2011-07-28 | 15,183 | 1,357 | 109 | completion | pg-pab | si | 5531_8764 | 1 | 0 | both yes | 0 | 1,056 | 1 | 1 |
| 5,531 | 65 | 2010 | 42.96 | 2009-11-09 | 14,557 | 2010-09-30 | 14,882 | 325 | 109 | late adm discharge | pg-pai | si | 37,235 | 2013 | 43.27 | 2010-03-01 | 14,669 | 2013-02-28 | 15,764 | 1,095 | 109 | referral | pg-pab | si | 5531_37235 | 1 | 1 | both yes | 0 | 213 | 1 | 0 |
| 5,531 | 65 | 2010 | 42.96 | 2009-11-09 | 14,557 | 2010-09-30 | 14,882 | 325 | 109 | late adm discharge | pg-pai | si | 85,373 | 2016 | 43.35 | 2010-03-31 | 14,699 | 2016-07-01 | 16,983 | 2,284 | 109 | completion | pg-pai | si | 5531_85373 | 1 | 1 | both yes | 0 | 183 | 1 | 0 |
| 37,235 | 65 | 2013 | 43.27 | 2010-03-01 | 14,669 | 2013-02-28 | 15,764 | 1,095 | 109 | referral | pg-pab | si | 85,373 | 2016 | 43.35 | 2010-03-31 | 14,699 | 2016-07-01 | 16,983 | 2,284 | 109 | completion | pg-pai | si | 37235_85373 | 1 | 1 | both yes | 1 | 1,065 | 1 | 0 |
| 24,760 | 66 | 2012 | 30.15 | 2012-02-01 | 15,371 | 2012-09-03 | 15,586 | 215 | 200 | referral | pg-pai | si | 33,553 | 2013 | 30.67 | 2012-08-10 | 15,562 | 2013-08-22 | 15,939 | 377 | 183 | completion | pg-pr | no | 24760_33553 | 0 | 1 | 1 | 24 | 1 | 0 | |
| 33,553 | 66 | 2013 | 30.67 | 2012-08-10 | 15,562 | 2013-08-22 | 15,939 | 377 | 183 | completion | pg-pr | no | 43,437 | 2013 | 31.65 | 2013-08-02 | 15,919 | 2013-12-01 | 16,040 | 121 | 200 | referral | pg-pai | si | 33553_43437 | 0 | 0 | second yes | 0 | 20 | 0 | 0 |
| 17,895 | 67 | 2011 | 38.61 | 2008-08-29 | 14,120 | 2011-09-01 | 15,218 | 1,098 | 123 | late dropout | pg-pai | si | 26,234 | 2012 | 39.27 | 2009-04-27 | 14,361 | 2012-08-30 | 15,582 | 1,221 | 123 | late dropout | pg-pai | si | 17895_26234 | 1 | 1 | both yes | 0 | 857 | 1 | 0 |
| 1,821 | 67 | 2010 | 39.02 | 2009-01-27 | 14,271 | 2010-09-16 | 14,868 | 597 | 123 | referral | pg-pai | si | 17,895 | 2011 | 38.61 | 2008-08-29 | 14,120 | 2011-09-01 | 15,218 | 1,098 | 123 | late dropout | pg-pai | si | 1821_17895 | 1 | 1 | both yes | 1 | 748 | 1 | 1 |
| 1,821 | 67 | 2010 | 39.02 | 2009-01-27 | 14,271 | 2010-09-16 | 14,868 | 597 | 123 | referral | pg-pai | si | 26,234 | 2012 | 39.27 | 2009-04-27 | 14,361 | 2012-08-30 | 15,582 | 1,221 | 123 | late dropout | pg-pai | si | 1821_26234 | 1 | 1 | both yes | 1 | 507 | 1 | 0 |
| Note.Each row represents an overlap. Variables ending with '_1' are the first case, and variables ending with '_2' correspond to the second case; | ||||||||||||||||||||||||||||||||
| a= date; b= numeric | ||||||||||||||||||||||||||||||||
| Same Center ID= If both cases share the same Center ID | ||||||||||||||||||||||||||||||||
| Financed By SENDA= If both cases are financed by SENDA; | ||||||||||||||||||||||||||||||||
| Referral= If the cause of discharge is the referral from another center (1= Referral); | ||||||||||||||||||||||||||||||||
| Days Overlapped= Difference between the date of admission of the earlier treatment, and the date of discharge of the latter treatment | ||||||||||||||||||||||||||||||||
| 2nd treatment has more treatment days= Earlier treatment has more days of treatment |
0.b.1 Overlappings due to missing discharge dates
First, we checked if a case had any missing value in the discharge date of the earlier treatment.
Code
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
bpmn::bpmn(paste0(wdpath, "cons/_input/overlapped_ranges_decision_tree_miss_disch_dates.bpmn"))Decision tree for overlapping due to missing discharge dates
Apply the decision tree to the overlapping cases with missing dates of discharge, first identifying scenarios.
Code
eliminate_0c_a1<-
CONS_C1_df_dup_overlaps_COMP |>
filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
filter(is.na(disch_date_1)|is.na(disch_date_2)) |>
filter(adm_date_1>adm_date_2, is.na(disch_date_1), !is.na(disch_date_2)) |>
(\(df) {
cat(paste0("0c.a1.Number of cases with missing dates of discharge, first obs. within second: ", formatC(nrow(df)*2, big.mark=",")),"\n")
cat(paste0("0c.a1.Number of patients with missing dates of discharge, first obs. within second: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})() |>
tidytable::pivot_longer(
cols = matches("_[12]$"), # All columns ending with _1 or _2
names_to = c(".value", "wave"),
names_pattern = "(.+)_([12])",
values_drop_na = FALSE) |> #2025: from 1659 to 3318
mutate(wave= as.numeric(wave)) |>
filter(is.na(disch_date)) |>
pull(rn) |> as.numeric()
# 0c.a1.Number of cases with missing dates of discharge, first obs. within second: 2
# 0c.a1.Number of patients with missing dates of discharge, first obs. within second: 1
keep_0c_a1<-
CONS_C1_df_dup_overlaps_COMP |>
filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
filter(is.na(disch_date_1)|is.na(disch_date_2)) |>
filter(adm_date_1>adm_date_2, is.na(disch_date_1), !is.na(disch_date_2)) |>
(\(df) {
cat(paste0("0c.a1.Number of cases with missing dates of discharge, first obs. within second: ", formatC(nrow(df)*2, big.mark=",")),"\n")
cat(paste0("0c.a1.Number of patients with missing dates of discharge, first obs. within second: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})() |>
tidytable::pivot_longer(
cols = matches("_[12]$"), # All columns ending with _1 or _2
names_to = c(".value", "wave"),
names_pattern = "(.+)_([12])",
values_drop_na = FALSE) |> #2025: from 1659 to 3318
mutate(wave= as.numeric(wave)) |>
filter(!is.na(disch_date)) |>
pull(rn) |> as.numeric()
# 0c.a1.Number of cases with missing dates of discharge, first obs. within second: 2
# 0c.a1.Number of patients with missing dates of discharge, first obs. within second: 1
replace_miss_dischdate_0c_a2<-
CONS_C1_df_dup_overlaps_COMP |>
filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
filter(is.na(disch_date_1)|is.na(disch_date_2)) |>
filter(adm_date_1>adm_date_2, !is.na(disch_date_1)) |>
(\(df) {
cat(paste0("0c.a2.Number of cases with missing dates of discharge, admission date of first tr. replace miss 2nd disch date: ", formatC(nrow(df)*2, big.mark=",")),"\n")
cat(paste0("0c.a2.Number of patients with missing dates of discharge, admission date of first tr. replace miss 2nd disch date: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})() |>
mutate(disch_date_num_2_rec= adm_date_rec_num_1-1)
# 0c.a2.Number of cases with missing dates of discharge, admission date of first tr. replace miss 2nd disch date: 0
# 0c.a2.Number of patients with missing dates of discharge, admission date of first tr. replace miss 2nd disch date: 0
replace_miss_dischdate_0c_a3_a<-
CONS_C1_df_dup_overlaps_COMP |>
filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
filter(is.na(disch_date_1)|is.na(disch_date_2)) |>
filter(adm_date_1>adm_date_2, is.na(disch_date_1), is.na(disch_date_2))|>
(\(df) {
cat(paste0("0c.a3.a.Number of cases with both missing dates of discharge (db retrieval=2022), replace 2nd discharge date with admission date of the 1st: ", formatC(nrow(df)*2, big.mark=",")),"\n")
cat(paste0("0c.a3.a.Number of patients with both missing dates of discharge (db retrieval=2022), replace 2nd discharge date with admission date of the 1st: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})()|>
mutate(disch_date_num_2_rec= adm_date_rec_num_1-1)
# 0c.a3.a.Number of cases with both missing dates of discharge (db retrieval=2022), replace 2nd discharge date with admission date of the 1st: 0
# 0c.a3.a.Number of patients with both missing dates of discharge (db retrieval=2022), replace 2nd discharge date with admission date of the 1st: 0
discard_cases_0c_a3_b<-
CONS_C1_df_dup_overlaps_COMP |>
filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
filter(is.na(disch_date_1)|is.na(disch_date_2)) |>
filter(adm_date_1>adm_date_2, is.na(disch_date_1), is.na(disch_date_2))|>
(\(df) {
cat(paste0("0c.a3.b.Number of cases with both missing dates of discharge (db retrieval!=2022): ", formatC(nrow(df)*2, big.mark=",")),"\n")
cat(paste0("0c.a3.b.Number of patients with both missing dates of discharge (db retrieval!=2022): ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})() |>
select(rn_1, rn_2) |> as.numeric()
# 0c.a3.b.Number of cases with both missing dates of discharge (db retrieval!=2022): 0
# 0c.a3.b.Number of patients with both missing dates of discharge (db retrieval!=2022): 0
replace_miss_dischdate_0c_b3_a<-
CONS_C1_df_dup_overlaps_COMP |>
filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
filter(is.na(disch_date_1)|is.na(disch_date_2)) |>
filter(adm_date_1<adm_date_2, is.na(disch_date_1), is.na(disch_date_2),ano_bd_2==2022)|>
(\(df) {
cat(paste0("0c.b3.a.Number of cases with missing dates of discharge, admission date of 2nd tr. (db retrieval=2022) replace miss 1st disch date: ", formatC(nrow(df)*2, big.mark=",")),"\n")
cat(paste0("0c.b3.a.Number of patients with missing dates of discharge, admission date of 2nd tr. (db retrieval=2022) replace miss 1st disch date: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})()|>
mutate(disch_date_num_1_rec= adm_date_rec_num_2-1)
# c.b3.a.Number of cases with missing dates of discharge, admission date of 2nd tr. (db retrieval=2022) replace miss 1st disch date: 2
# 0c.b3.a.Number of patients with missing dates of discharge, admission date of 2nd tr. (db retrieval=2022) replace miss 1st disch date: 1
discard_0c_b3_b<-
CONS_C1_df_dup_overlaps_COMP |>
filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
filter(is.na(disch_date_1)|is.na(disch_date_2)) |>
filter(adm_date_1<adm_date_2, is.na(disch_date_1), is.na(disch_date_2), ano_bd_2!=2022) |>
(\(df) {
cat(paste0("0c.b3.b.Number of cases with missing dates of discharge, both treatments are not 2022: ", formatC(nrow(df)*2, big.mark=",")),"\n")
cat(paste0("0c.b3.b.Number of patients with missing dates of discharge, both treatments are not 2022: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})() |>
select(rn_1, rn_2) |> as.numeric()
# 0c.b3.b.Number of cases with missing dates of discharge, both treatments are not 2022: 0
# 0c.b3.b.Number of patients with missing dates of discharge, both treatments are not 2022: 0
eliminate_0c_b1<-
CONS_C1_df_dup_overlaps_COMP |>
filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
filter(is.na(disch_date_1)|is.na(disch_date_2)) |>
filter(adm_date_1<adm_date_2, is.na(disch_date_2), !is.na(disch_date_1)) |>
(\(df) {
cat(paste0("0c.b1.Number of cases with missing dates of discharge, second obs. within first: ", formatC(nrow(df)*2, big.mark=",")),"\n")
cat(paste0("0c.b1.Number of patients with missing dates of discharge, second obs. within first: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})() |>
tidytable::pivot_longer(
cols = matches("_[12]$"), # All columns ending with _1 or _2
names_to = c(".value", "wave"),
names_pattern = "(.+)_([12])",
values_drop_na = FALSE) |> #2025: from 1659 to 3318
mutate(wave= as.numeric(wave)) |>
arrange(pair_id) |>
filter(is.na(disch_date)) |>
pull(rn) |> as.numeric()
keep_0c_b1<-
CONS_C1_df_dup_overlaps_COMP |>
filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
filter(is.na(disch_date_1)|is.na(disch_date_2)) |>
filter(adm_date_1<adm_date_2, is.na(disch_date_2), !is.na(disch_date_1)) |>
(\(df) {
cat(paste0("0c.b1.Number of cases with missing dates of discharge, second obs. within first: ", formatC(nrow(df)*2, big.mark=",")),"\n")
cat(paste0("0c.b1.Number of patients with missing dates of discharge, second obs. within first: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})() |>
tidytable::pivot_longer(
cols = matches("_[12]$"), # All columns ending with _1 or _2
names_to = c(".value", "wave"),
names_pattern = "(.+)_([12])",
values_drop_na = FALSE) |> #2025: from 1659 to 3318
mutate(wave= as.numeric(wave)) |>
arrange(pair_id) |>
filter(!is.na(disch_date)) |>
pull(rn) |> as.numeric()
# 0c.b1.Number of cases with missing dates of discharge, second obs. within first: 8
# 0c.b1.Number of patients with missing dates of discharge, second obs. within first: 4
replace_miss_dischdate_0c_b2<-
CONS_C1_df_dup_overlaps_COMP |>
filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
filter(is.na(disch_date_1)|is.na(disch_date_2)) |>
filter(adm_date_1<adm_date_2, !is.na(disch_date_2)) |>
(\(df) {
cat(paste0("0c.b2.Number of cases with missing dates of discharge, admission date of first tr. replace miss 1st disch date: ", formatC(nrow(df)*2, big.mark=",")),"\n")
cat(paste0("0c.b2.Number of patients with missing dates of discharge, admission date of first tr. replace miss 1st disch date: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})() |>
mutate(disch_date_num_1_rec= adm_date_rec_num_2-1)
# 0c.b2.Number of cases with missing dates of discharge, admission date of first tr. replace miss 1st disch date: 4
# 0c.b2.Number of patients with missing dates of discharge, admission date of first tr. replace miss 1st disch date: 2
invisible("If you want to explore the source of errors")
# select(replace_miss_dischdate_0c_b2, hash_key, adm_date_1, disch_date_num_1_rec) |>
# left_join(select(SISTRAT23_c1_2010_2022_df_prev1, hash_key, adm_date, fecha_egreso_de_tratamiento), by=c("hash_key"="hash_key", "adm_date_1"="adm_date"))0c.a1.Number of cases with missing dates of discharge, first obs. within second: 2
0c.a1.Number of patients with missing dates of discharge, first obs. within second: 1
0c.a1.Number of cases with missing dates of discharge, first obs. within second: 2
0c.a1.Number of patients with missing dates of discharge, first obs. within second: 1
0c.a2.Number of cases with missing dates of discharge, admission date of first tr. replace miss 2nd disch date: 0
0c.a2.Number of patients with missing dates of discharge, admission date of first tr. replace miss 2nd disch date: 0
0c.a3.a.Number of cases with both missing dates of discharge (db retrieval=2022), replace 2nd discharge date with admission date of the 1st: 0
0c.a3.a.Number of patients with both missing dates of discharge (db retrieval=2022), replace 2nd discharge date with admission date of the 1st: 0
0c.a3.b.Number of cases with both missing dates of discharge (db retrieval!=2022): 0
0c.a3.b.Number of patients with both missing dates of discharge (db retrieval!=2022): 0
0c.b3.a.Number of cases with missing dates of discharge, admission date of 2nd tr. (db retrieval=2022) replace miss 1st disch date: 2
0c.b3.a.Number of patients with missing dates of discharge, admission date of 2nd tr. (db retrieval=2022) replace miss 1st disch date: 1
0c.b3.b.Number of cases with missing dates of discharge, both treatments are not 2022: 0
0c.b3.b.Number of patients with missing dates of discharge, both treatments are not 2022: 0
0c.b1.Number of cases with missing dates of discharge, second obs. within first: 8
0c.b1.Number of patients with missing dates of discharge, second obs. within first: 4
0c.b1.Number of cases with missing dates of discharge, second obs. within first: 8
0c.b1.Number of patients with missing dates of discharge, second obs. within first: 4
0c.b2.Number of cases with missing dates of discharge, admission date of first tr. replace miss 1st disch date: 4
0c.b2.Number of patients with missing dates of discharge, admission date of first tr. replace miss 1st disch date: 2
We apply the scenarios found to the main database.
Code
#eliminate_0c_a1
#keep_0c_a1
#eliminate_0c_b1
#keep_0c_b1
#discard_cases_0c_a3_b
#discard_0c_b3_b
hashkeys_overlapped_discarded_missing_dates<-
rbind.data.frame(filter(SISTRAT23_c1_2010_2022_df_prev1h, rn %in% discard_cases_0c_a3_b[!is.na(discard_cases_0c_a3_b)]),
filter(SISTRAT23_c1_2010_2022_df_prev1h, rn %in% discard_0c_b3_b[!is.na(discard_0c_b3_b)])) |> distinct(hash_key) |> pull(hash_key)
SISTRAT23_c1_2010_2022_df_prev1i<-
SISTRAT23_c1_2010_2022_df_prev1h|>
(\(df) {
cat(paste0("4. Database before correcting overlapping with missing discharge dates, obs.: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4. Database before correcting overlapping with missing discharge dates, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})()|>
mutate(OBS= case_when(hash_key %in% hashkeys_overlapped_discarded_missing_dates~paste0(as.character(OBS),";","4.1.0c.a/b3.b.Eliminate overlapping cases with both missing dates prior to 2022"), T~ OBS))|>
filter(!rn %in% c(discard_cases_0c_a3_b, discard_0c_b3_b))|>
mutate(OBS= case_when(rn %in% c(keep_0c_a1, keep_0c_b1)~paste0(as.character(OBS),";","4.2.0c.a1/b1.Eliminate overlapping cases with episodes within others and missing discharge dates"), T~ OBS))|>
filter(!rn %in% c(eliminate_0c_a1, eliminate_0c_b1))|>
mutate(OBS= case_when(rn %in% c(replace_miss_dischdate_0c_a2$rn_2, replace_miss_dischdate_0c_b2$rn_1, replace_miss_dischdate_0c_a3_a$rn_2, replace_miss_dischdate_0c_b3_a$rn_1)~ paste0(as.character(OBS),";","4.3.0c.b2/a2/a3.a/b3.a.Replace missing discharge date with admission date of the first treatment minus 1 day"), T~ OBS)) |>
left_join(replace_miss_dischdate_0c_a2[, c("rn_2", "disch_date_num_2_rec")], by=c("rn"="rn_2"))|>
left_join(replace_miss_dischdate_0c_b2[, c("rn_1", "disch_date_num_1_rec")], by=c("rn"="rn_1"))|>
left_join(replace_miss_dischdate_0c_a3_a[, c("rn_2", "disch_date_num_2_rec")], by=c("rn"="rn_2"), suffix = c("_a2", "_a3_a"))|>
left_join(replace_miss_dischdate_0c_b3_a[, c("rn_1", "disch_date_num_1_rec")], by=c("rn"="rn_1"), suffix = c("_b2", "_b3_a"))|>
mutate(disch_date_num_rec= case_when(!is.na(disch_date_num_2_rec_a2 )~ disch_date_num_2_rec_a2, !is.na(disch_date_num_1_rec_b2)~ disch_date_num_1_rec_b2, !is.na(disch_date_num_2_rec_a3_a)~ disch_date_num_2_rec_a3_a, !is.na(disch_date_num_1_rec_b3_a)~ disch_date_num_1_rec_b3_a, T~ disch_date_rec0_num)) |>
mutate(tr_compliance_rec= case_when(!is.na(disch_date_num_2_rec_a2)~ NA_character_, !is.na(disch_date_num_1_rec_b2)~ NA_character_, !is.na(disch_date_num_2_rec_a3_a)~ NA_character_, !is.na(disch_date_num_1_rec_b3_a)~ NA_character_, T~ tr_compliance_rec))|>
mutate(dit_rec2= disch_date_num_rec-adm_date_rec_num)|>
mutate(tr_compliance_rec= case_when(!is.na(disch_date_num_2_rec_a2)~ NA_character_, !is.na(disch_date_num_1_rec_b2)~ NA_character_, !is.na(disch_date_num_2_rec_a3_a)~ NA_character_, !is.na(disch_date_num_1_rec_b3_a)~ NA_character_, T~ tr_compliance_rec))|>
select(-disch_date_num_2_rec_a2, -disch_date_num_1_rec_b2, -disch_date_num_2_rec_a3_a, -disch_date_num_1_rec_b3_a)|>
(\(df) {
cat(paste0("4. Database after correcting overlapping with missing discharge dates, obs.: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4. Database after correcting overlapping with missing discharge dates, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1h))stop("Error: Added treatment episodes in the process")
df
})()
# 4. Database before correcting overlapping with missing discharge dates, obs.: 150,187
# 4. Database before correcting overlapping with missing discharge dates, RUNs: 106,283
# 4. Database after correcting overlapping with missing discharge dates, obs.: 150,182
# 4. Database after correcting overlapping with missing discharge dates, RUNs: 106,283 4. Database before correcting overlapping with missing discharge dates, obs.: 150,187
4. Database before correcting overlapping with missing discharge dates, RUNs: 106,283
4. Database after correcting overlapping with missing discharge dates, obs.: 150,182
4. Database after correcting overlapping with missing discharge dates, RUNs: 106,283
The database SISTRAT23_c1_2010_2022_df_prev1i includes overlapping correction to account for missing discharge dates. We replaced the dates in disch_date_num_rec, dit_rec2 accounting for the new discharge date and tr_compliance_rec were replaced with missing values for the cases that had missing discharge dates with replaced values.
0.b.2 After replacement for missing dates of discharge
Code
CONS_C1_df_dup_intervals_after_miss<-
SISTRAT23_c1_2010_2022_df_prev1i|>
mutate(disch_date_num_miss= ifelse(is.na(disch_date_num_rec), 19475, disch_date_num_rec))|> #equivalente a 2023-04-28 as.numeric(as.Date("2023-01-01"))
rename("hash_key_2"="hash_key", "rn2"="rn")|>
select(rn2, hash_key_2, TABLE, adm_age_rec, adm_date_rec, adm_date_rec_num , disch_date_rec0, disch_date_num_miss, dit_rec2, id_centro, tr_compliance_rec, plan_type, senda)|>
#dplyr::filter(motivodeegreso!="Derivación")|>
data.table::as.data.table()
overlap_dates_C1_after_miss <- janitor::clean_names(
sqldf::sqldf(
"
SELECT *
FROM CONS_C1_df_dup_intervals_after_miss AS x
INNER JOIN CONS_C1_df_dup_intervals_after_miss AS y
ON x.hash_key_2 = y.hash_key_2
AND x.rn2 < y.rn2 -- Avoids duplicates (eg.: x vs y and then y vs x)
AND x.adm_date_rec_num < y.disch_date_num_miss -- x Admitted before being admitted into another treatment
AND x.disch_date_num_miss > y.adm_date_rec_num -- x Discharged after being admitted in other
"
)) |>
`colnames<-`(c("rn_1", "hash_key_1", "ano_bd_1", "adm_age_1", "adm_date_1", "adm_date_rec_num_1", "disch_date_1", "disch_date_num_1", "dit_1", "id_centro_1", "tr_compliance_1", "plan_type_1", "senda_1", "rn_2",
"hash_key_2", "ano_bd_2", "adm_age_2", "adm_date_2", "adm_date_rec_num_2", "disch_date_2", "disch_date_num_2", "dit_2", "id_centro_2", "tr_compliance_2", "plan_type_2", "senda_2"))
cat(paste0("Number of overlapped dates, observations: ", nrow(overlap_dates_C1_after_miss)),"\n")
cat(paste0("Number of overlapped dates, RUNs: ", nrow(distinct(overlap_dates_C1_after_miss, hash_key_1))))
#Number of overlapped dates, observations: 1546 june 2025; 1554, 1518, 1579
#Number of overlapped dates, RUNs: 1405 june 2025; 1412, 1385, 1411
#The rows on the left originate from older databases.
CONS_C1_df_dup_overlaps_COMP_after_miss <-
as_tidytable(overlap_dates_C1_after_miss)|>
mutate(pair_id= paste0(rn_1,"_",rn_2))|>
mutate(same_id=ifelse(id_centro_1==id_centro_2,1,0))|>
mutate(bd_2_earlier=ifelse(ano_bd_2>ano_bd_1,1,0))|> #es el dato de la derecha de una base de datos mas reciente.
mutate(senda_status= case_when(senda_1=="si" & senda_2=="si"~ "both yes", senda_1=="no" & senda_2=="no"~ "both no", senda_1=="no" & senda_2=="si"~ "second yes", senda_1=="no" & senda_2=="no"~ "second no", T~NA_character_))|>
mutate(referral= ifelse(tr_compliance_1=="referral",1,0))|>
mutate(days_overlapped=disch_date_num_1-adm_date_rec_num_2)|> # para que hayan dias positivos. Se supone que la fecha de egreso es más reciente que la fecha de ingreso del evento que superpone.
mutate(more_dit=ifelse(dit_2>dit_1,1,0))|> #más días tratado en 2
mutate(trat_1_within_2=ifelse(disch_date_num_1<disch_date_num_2 & adm_date_rec_num_1>adm_date_rec_num_2,1,0))|>
select(-hash_key_2) |>
rename("hash_key"="hash_key_1")
####
CONS_C1_df_dup_overlaps_COMP_after_miss|>
(\(df) {
rio::export(df, "_out/_overlaps_dup_step_2_after_miss.xlsx") #for visual comparison in excel
knitr::kable(filter(df, hash_key %in% pull(sample_n_with_seed(CONS_C1_df_dup_overlaps_COMP_after_miss,20, seed=2125),"hash_key"))|> mutate(hash_key= as.numeric(factor(hash_key))), format = "html", format.args = list(decimal.mark = ".", big.mark = ","), caption="Cases with overlapped treatment ranges (after correcting for missing discharge dates)", align = rep('c', 32),
#col.names = c("Row No.(1)", "HASH", "Year of\nDataset(1)", "Admission age(1)", "Admission\ndate(1)(a)", "Admission\ndate(1)(b)", "Discharge\ndate(1)(a)", "Discharge\ndate(1)(b)", "Treatment Days(1)", "Center ID(1)", "Cause of\nDischarge(1)", "Plan Type(1)", "SENDA(1)", "Row No.(2)", "Year of\nDataset(2)", "Admission age(2)", "Admission\ndate(2)(a)", "Admission\ndate(2)(b)", "Discharge\ndate(2)(a)", "Discharge\ndate(2)(b)", "Treatment Days(2)", "Center ID(2)", "Cause of\nDischarge(2)", "Plan Type(2)", "SENDA(2)", "Same Center ID", "Earlier Dataset \nof 2nd Treatment", "Financed \nBy SENDA", "Referral", "Days Overlapped", "2nd treatment has more treatment days", "1st treatment\nabsorbs 2nd")
) |>
kableExtra::kable_styling(bootstrap_options = c("striped", "hover"),font_size = 8)|>
kableExtra::add_footnote( c("Note.Each row represents an overlap. Variables ending with '_1' are the first case, and variables ending with '_2' correspond to the second case;", "a= date; b= numeric", "Same Center ID= If both cases share the same Center ID", "Financed By SENDA= If both cases are financed by SENDA;", "Referral= If the cause of discharge is the referral from another center (1= Referral);","Days Overlapped= Difference between the date of admission of the earlier treatment, and the date of discharge of the latter treatment","2nd treatment has more treatment days= Earlier treatment has more days of treatment"), notation = "none")|>
kableExtra::scroll_box(width = "100%", height = "375px")
})()Number of overlapped dates, observations: 1546
Number of overlapped dates, RUNs: 1405
| rn_1 | hash_key | ano_bd_1 | adm_age_1 | adm_date_1 | adm_date_rec_num_1 | disch_date_1 | disch_date_num_1 | dit_1 | id_centro_1 | tr_compliance_1 | plan_type_1 | senda_1 | rn_2 | ano_bd_2 | adm_age_2 | adm_date_2 | adm_date_rec_num_2 | disch_date_2 | disch_date_num_2 | dit_2 | id_centro_2 | tr_compliance_2 | plan_type_2 | senda_2 | pair_id | same_id | bd_2_earlier | senda_status | referral | days_overlapped | more_dit | trat_1_within_2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 49,390 | 1 | 2014 | 35.81 | 2013-06-17 | 15,873 | 2014-01-27 | 16,097 | 224 | 166 | late dropout | pg-pab | si | 53,962 | 2014 | 36.35 | 2013-12-30 | 16,069 | 2014-01-03 | 16,073 | 4 | 163 | early dropout | m-pr | si | 49390_53962 | 0 | 0 | both yes | 0 | 28 | 0 | 0 |
| 78,289 | 2 | 2015 | 35.03 | 2015-06-18 | 16,604 | 2015-07-28 | 16,644 | 40 | 141 | referral | pg-pai | si | 80,895 | 2015 | 35.02 | 2015-06-12 | 16,598 | 2015-11-26 | 16,765 | 167 | 141 | late dropout | pg-pai | si | 78289_80895 | 1 | 0 | both yes | 1 | 46 | 1 | 1 |
| 79,294 | 2 | 2015 | 35.14 | 2015-07-28 | 16,644 | 2015-08-10 | 16,657 | 13 | 142 | referral | m-pr | si | 80,895 | 2015 | 35.02 | 2015-06-12 | 16,598 | 2015-11-26 | 16,765 | 167 | 141 | late dropout | pg-pai | si | 79294_80895 | 0 | 0 | both yes | 1 | 59 | 1 | 1 |
| 122,569 | 3 | 2017 | 37.58 | 2017-09-13 | 17,422 | 2018-01-23 | 17,554 | 132 | 132 | referral | pg-pr | si | 133,197 | 2018 | 37.89 | 2018-01-02 | 17,533 | 2018-01-31 | 17,562 | 29 | 726 | referral | pg-pr | si | 122569_133197 | 0 | 1 | both yes | 1 | 21 | 0 | 0 |
| 38,600 | 4 | 2013 | 25.30 | 2013-04-04 | 15,799 | 2013-04-23 | 15,818 | 19 | 238 | referral | pg-pai | si | 38,810 | 2013 | 25.34 | 2013-04-22 | 15,817 | 2013-11-11 | 16,020 | 203 | 258 | late dropout | pg-pr | si | 38600_38810 | 0 | 0 | both yes | 1 | 1 | 1 | 0 |
| 124,540 | 5 | 2017 | 28.46 | 2017-11-24 | 17,494 | 2018-01-23 | 17,554 | 60 | 132 | referral | pg-pr | si | 133,182 | 2018 | 28.57 | 2018-01-02 | 17,533 | 2018-07-06 | 17,718 | 185 | 726 | completion | pg-pr | si | 124540_133182 | 0 | 1 | both yes | 1 | 21 | 1 | 0 |
| 132,639 | 6 | 2018 | 64.13 | 2017-11-14 | 17,484 | 2018-03-29 | 17,619 | 135 | 291 | referral | pg-pai | si | 135,843 | 2018 | 64.47 | 2018-03-20 | 17,610 | 2019-01-31 | 17,927 | 317 | 303 | late adm discharge | pg-pr | si | 132639_135843 | 0 | 0 | both yes | 1 | 9 | 1 | 0 |
| 126,440 | 7 | 2018 | 28.25 | 2016-12-12 | 17,147 | 2018-07-04 | 17,716 | 569 | 465 | completion | pg-pab | si | 127,111 | 2018 | 28.48 | 2017-03-06 | 17,231 | 2018-10-08 | 17,812 | 581 | 263 | referral | m-pai | si | 126440_127111 | 0 | 0 | both yes | 0 | 485 | 1 | 0 |
| 109,974 | 8 | 2017 | 34.07 | 2016-10-04 | 17,078 | 2017-03-24 | 17,249 | 171 | 232 | referral | pg-pai | si | 113,052 | 2017 | 34.41 | 2017-02-07 | 17,204 | 2017-06-12 | 17,329 | 125 | 234 | completion | m-pr | no | 109974_113052 | 0 | 0 | 1 | 45 | 0 | 0 | |
| 75,459 | 9 | 2015 | 22.99 | 2015-04-27 | 16,552 | 2015-09-08 | 16,686 | 134 | 166 | late dropout | pg-pai | si | 89,203 | 2016 | 23.35 | 2015-09-04 | 16,682 | 2016-02-23 | 16,854 | 172 | 163 | late dropout | pg-pr | si | 75459_89203 | 0 | 1 | both yes | 0 | 4 | 1 | 0 |
| 58,294 | 10 | 2014 | 23.99 | 2014-05-19 | 16,209 | 2014-10-29 | 16,372 | 163 | 143 | late dropout | pg-pai | si | 69,404 | 2015 | 24.39 | 2014-10-14 | 16,357 | 2019-12-31 | 18,261 | 1,904 | 364 | adm truncated | pg-pai | si | 58294_69404 | 0 | 1 | both yes | 0 | 15 | 1 | 0 |
| 36,916 | 11 | 2013 | 35.16 | 2013-02-13 | 15,749 | 2013-05-23 | 15,848 | 99 | 287 | referral | pg-pai | si | 65,608 | 2015 | 35.39 | 2013-05-08 | 15,833 | 2016-01-05 | 16,805 | 972 | 295 | referral | pg-pai | si | 36916_65608 | 0 | 1 | both yes | 1 | 15 | 1 | 0 |
| 23,744 | 12 | 2012 | 22.54 | 2012-01-02 | 15,341 | 2012-02-16 | 15,386 | 45 | 204 | referral | pg-pab | si | 24,662 | 2012 | 22.66 | 2012-02-14 | 15,384 | 2012-07-17 | 15,538 | 154 | 215 | late dropout | pg-pr | si | 23744_24662 | 0 | 0 | both yes | 1 | 2 | 1 | 0 |
| 200,444 | 13 | 2021 | 33.75 | 2021-07-26 | 18,834 | 2021-09-30 | 18,900 | 66 | 259 | referral | pg-pai | si | 209,379 | 2022 | 33.89 | 2021-09-14 | 18,884 | 2022-02-07 | 19,030 | 146 | 436 | referral | m-pai | si | 200444_209379 | 0 | 1 | both yes | 1 | 16 | 1 | 0 |
| 21,449 | 14 | 2012 | 39.12 | 2011-05-02 | 15,096 | 2012-05-02 | 15,462 | 366 | 337 | referral | pg-pai | si | 28,085 | 2012 | 40.10 | 2012-04-25 | 15,455 | 2012-07-03 | 15,524 | 69 | 258 | early adm discharge | pg-pr | si | 21449_28085 | 0 | 0 | both yes | 1 | 7 | 0 | 0 |
| 64,179 | 15 | 2014 | 37.82 | 2014-11-03 | 16,377 | 2015-01-27 | 16,462 | 85 | 141 | completion | pg-pab | si | 71,274 | 2015 | 38.00 | 2015-01-07 | 16,442 | 2015-12-17 | 16,786 | 344 | 141 | completion | pg-pr | no | 64179_71274 | 1 | 1 | 0 | 20 | 1 | 0 | |
| 10,433 | 16 | 2011 | 46.26 | 2010-05-19 | 14,748 | 2011-07-20 | 15,175 | 427 | 104 | late dropout | m-pr | no | 17,877 | 2011 | 47.43 | 2011-07-18 | 15,173 | 2012-01-31 | 15,370 | 197 | 331 | late dropout | m-pai | si | 10433_17877 | 0 | 0 | second yes | 0 | 2 | 0 | 0 |
| 24,667 | 17 | 2012 | 31.20 | 2012-02-15 | 15,385 | 2012-05-18 | 15,478 | 93 | 161 | referral | m-pai | si | 27,109 | 2012 | 31.45 | 2012-05-17 | 15,477 | 2012-10-03 | 15,616 | 139 | 275 | late dropout | m-pr | si | 24667_27109 | 0 | 0 | both yes | 1 | 1 | 1 | 0 |
| 27,109 | 17 | 2012 | 31.45 | 2012-05-17 | 15,477 | 2012-10-03 | 15,616 | 139 | 275 | late dropout | m-pr | si | 48,001 | 2014 | 31.83 | 2012-10-02 | 15,615 | 2015-02-16 | 16,482 | 867 | 161 | late dropout | m-pai | si | 27109_48001 | 0 | 1 | both yes | 0 | 1 | 1 | 0 |
| 31,236 | 18 | 2012 | 25.11 | 2012-11-05 | 15,649 | 2013-01-01 | 15,706 | 57 | 206 | referral | pg-pab | si | 36,078 | 2013 | 25.24 | 2012-12-21 | 15,695 | 2013-02-27 | 15,763 | 68 | 205 | referral | pg-pai | si | 31236_36078 | 0 | 1 | both yes | 1 | 11 | 1 | 0 |
| 58,436 | 19 | 2014 | 34.43 | 2014-06-04 | 16,225 | 2014-06-23 | 16,244 | 19 | 432 | early dropout | m-pr | si | 67,549 | 2015 | 34.43 | 2014-06-02 | 16,223 | 2015-02-11 | 16,477 | 254 | 291 | completion | pg-pai | si | 58436_67549 | 0 | 1 | both yes | 0 | 21 | 1 | 1 |
| 17,323 | 20 | 2011 | 48.68 | 2011-06-23 | 15,148 | 2011-09-26 | 15,243 | 95 | 232 | referral | pg-pai | si | 18,975 | 2011 | 48.91 | 2011-09-13 | 15,230 | 2011-12-14 | 15,322 | 92 | 246 | late dropout | pg-pai | si | 17323_18975 | 0 | 0 | both yes | 1 | 13 | 0 | 0 |
| Note.Each row represents an overlap. Variables ending with '_1' are the first case, and variables ending with '_2' correspond to the second case; | ||||||||||||||||||||||||||||||||
| a= date; b= numeric | ||||||||||||||||||||||||||||||||
| Same Center ID= If both cases share the same Center ID | ||||||||||||||||||||||||||||||||
| Financed By SENDA= If both cases are financed by SENDA; | ||||||||||||||||||||||||||||||||
| Referral= If the cause of discharge is the referral from another center (1= Referral); | ||||||||||||||||||||||||||||||||
| Days Overlapped= Difference between the date of admission of the earlier treatment, and the date of discharge of the latter treatment | ||||||||||||||||||||||||||||||||
| 2nd treatment has more treatment days= Earlier treatment has more days of treatment |
We then proceeded to correct the overlapping cases with missing discharge dates.
Four alternatives were delimited in order to resolve overlapped dates:
- Impute treated days and replace the date of discharge
- Keep the earliest treatment
- Discard the earliest treatment
- Subtract days to the date of discharge of the last treatment
0.b.4 Overlappings <= 30 days
Meanwhile, we focus in cases with overlap of less than 30 days.
Code
replace_disch_dates_0a<-
CONS_C1_df_dup_overlaps_COMP_after_miss|>
filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
filter(days_overlapped<=30)|>
(\(df) {
cat(paste0("4. Overlapping <= 30 days, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4. Overlapping <= 30 days, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})()|>#1,019 #1002 # 1,022 1,005; 1,021 1,004 june 2025
mutate(OBS= case_when(adm_date_rec_num_1>adm_date_rec_num_2~ "4.3.0a.<=30 days overlapping, replaced w date of discharge of last treatment minus 1(first)", adm_date_rec_num_1< adm_date_rec_num_2~ "4.3.0a.<=30 days overlapping, replaced w date of discharge of last treatment minus 1(second)",T~NA_character_))|>
mutate(disch_date_num_2_rec= case_when(adm_date_rec_num_1>adm_date_rec_num_2~ adm_date_rec_num_1-1, T~NA_real_))|>
mutate(disch_date_num_1_rec= case_when(adm_date_rec_num_2>adm_date_rec_num_1~ adm_date_rec_num_2-1, T~NA_real_))4. Overlapping <= 30 days, cases: 1,021
4. Overlapping <= 30 days, RUNs: 1,004
We apply the decision tree to the overlapping cases with less than 30 days of overlap.
Code
#replace_disch_dates_0a[, c("rn_1", "disch_date_num_2_rec", "OBS")]
SISTRAT23_c1_2010_2022_df_prev1j<-
SISTRAT23_c1_2010_2022_df_prev1i|>
(\(df) {
cat(paste0("4. Database before correcting overlapping with <= 30 days of overlapping, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4. Database before correcting overlapping with <= 30 days of overlapping, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})()|>
left_join(subset(replace_disch_dates_0a[, c("rn_2", "disch_date_num_2_rec", "OBS")], !is.na(disch_date_num_2_rec)), by=c("rn"="rn_2"), suffix= c("","_0a1"), first=T)|>
left_join(subset(replace_disch_dates_0a[, c("rn_1", "disch_date_num_1_rec", "OBS")], !is.na(disch_date_num_1_rec)), by=c("rn"="rn_1"), suffix= c("","_0a2"), first=T)|>
mutate(OBS= case_when(!is.na(OBS_0a1)~paste0(as.character(OBS),";",OBS_0a1), T~ OBS))|>
mutate(OBS= case_when(!is.na(OBS_0a2)~paste0(as.character(OBS),";",OBS_0a2), T~ OBS))|>
mutate(disch_date_num_rec2= case_when(!is.na(disch_date_num_2_rec)~ disch_date_num_2_rec, !is.na(disch_date_num_1_rec)~ disch_date_num_1_rec, T~ disch_date_num_rec))|>
mutate(dit_rec3= disch_date_num_rec2- adm_date_rec_num)|>
select(-disch_date_num_2_rec, -disch_date_num_1_rec, -OBS_0a1, -OBS_0a2)|>
(\(df) {
cat(paste0("4. Database after correcting overlapping with <= 30 days of overlapping, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4. Database after correcting overlapping with <= 30 days of overlapping, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1i))stop("Error: Added treatment episodes in the process")
df
})()
# 4. Database before correcting overlapping with <= 30 days of overlapping, cases: 150,182
# 4. Database before correcting overlapping with <= 30 days of overlapping, RUNs: 106,283
# 4. Database after correcting overlapping with <= 30 days of overlapping, cases: 150,182
# 4. Database after correcting overlapping with <= 30 days of overlapping, RUNs: 106,2834. Database before correcting overlapping with <= 30 days of overlapping, cases: 150,182
4. Database before correcting overlapping with <= 30 days of overlapping, RUNs: 106,283
4. Database after correcting overlapping with <= 30 days of overlapping, cases: 150,182
4. Database after correcting overlapping with <= 30 days of overlapping, RUNs: 106,283
We obtained the database SISTRAT23_c1_2010_2022_df_prev1j. We corrected the dates in disch_date_num_rec2, dit_rec3 to account for the new discharge date.
0.b.5 Treatment episode without a single day in treatment
We apply the detection of duplicates again and scenarios
Code
CONS_C1_df_dup_intervals_after_miss_less30d<-
SISTRAT23_c1_2010_2022_df_prev1j|>
mutate(disch_date_num_miss= ifelse(is.na(disch_date_num_rec2), 19475, disch_date_num_rec2))|> #equivalente a 2023-04-28 as.numeric(as.Date("2023-01-01"))
rename("hash_key_2"="hash_key", "rn2"="rn")|>
select(rn2, hash_key_2, TABLE, adm_age_rec, adm_date_rec, adm_date_rec_num , disch_date_rec0, disch_date_num_miss, dit_rec3, id_centro, tr_compliance_rec, plan_type, senda)|>
#dplyr::filter(motivodeegreso!="Derivación")|>
data.table::as.data.table()
overlap_dates_C1_after_miss_less30d <- janitor::clean_names(
sqldf::sqldf(
"
SELECT *
FROM CONS_C1_df_dup_intervals_after_miss_less30d AS x
INNER JOIN CONS_C1_df_dup_intervals_after_miss_less30d AS y
ON x.hash_key_2 = y.hash_key_2
AND x.rn2 < y.rn2 -- Avoids duplicates (eg.: x vs y and then y vs x)
AND x.adm_date_rec_num < y.disch_date_num_miss -- x Admitted before being admitted into another treatment
AND x.disch_date_num_miss > y.adm_date_rec_num -- x Discharged after being admitted in other
"
)) |>
`colnames<-`(c("rn_1", "hash_key_1", "ano_bd_1", "adm_age_1", "adm_date_1", "adm_date_rec_num_1", "disch_date_1", "disch_date_num_1", "dit_1", "id_centro_1", "tr_compliance_1", "plan_type_1", "senda_1", "rn_2", "hash_key_2", "ano_bd_2", "adm_age_2", "adm_date_2", "adm_date_rec_num_2", "disch_date_2", "disch_date_num_2", "dit_2", "id_centro_2", "tr_compliance_2", "plan_type_2", "senda_2"))
cat(paste0("Number of overlapped dates, observations: ", nrow(overlap_dates_C1_after_miss_less30d)),"\n")
cat(paste0("Number of overlapped dates, RUNs: ", nrow(distinct(overlap_dates_C1_after_miss_less30d, hash_key_1))))
#Number of overlapped dates, observations: 536 ; 560 ; 532 ; 525 june 2025
#Number of overlapped dates, RUNs: 412 ; 420 ; 418 ; 412 june 2025
#The rows on the left originate from older databases.
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d <-
as_tidytable(overlap_dates_C1_after_miss_less30d)|>
mutate(pair_id= paste0(rn_1,"_",rn_2))|>
mutate(same_id=ifelse(id_centro_1==id_centro_2,1,0))|>
mutate(bd_2_earlier=ifelse(ano_bd_2>ano_bd_1,1,0))|> #es el dato de la derecha de una base de datos mas reciente.
mutate(senda_status= case_when(senda_1=="si" & senda_2=="si"~ "both yes", senda_1=="no" & senda_2=="no"~ "both no", senda_1=="no" & senda_2=="si"~ "second yes", senda_1=="no" & senda_2=="no"~ "second no", T~NA_character_))|>
mutate(referral= ifelse(tr_compliance_1=="referral",1,0))|>
mutate(days_overlapped=disch_date_num_1-adm_date_rec_num_2)|> # para que hayan dias positivos. Se supone que la fecha de egreso es más reciente que la fecha de ingreso del evento que superpone.
mutate(more_dit=ifelse(dit_2>dit_1,1,0))|> #más días tratado en 2
mutate(trat_1_within_2=ifelse(disch_date_num_1<disch_date_num_2 & adm_date_rec_num_1>adm_date_rec_num_2,1,0))|>
select(-hash_key_2) |>
rename("hash_key"="hash_key_1")Number of overlapped dates, observations: 525
Number of overlapped dates, RUNs: 412
We apply the scenarios to the main database, discarding cases with less than one day in treatment.
Code
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d |>
filter(dit_2<1| dit_2<1)|>
(\(df) {
cat(paste0("4. Less than one day in treatment, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4. Less than one day in treatment, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
})()
# 4. Less than one day in treatment, cases: 1
# 4. Less than one day in treatment, RUNs: 1
discard_0c_0b<-
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d |>
filter(dit_2<1| dit_2<1)|>
mutate(rn= case_when(dit_1<1~ rn_1, dit_2<1~ rn_2, T~NA_real_)) |>
pull(rn)
keep_0c_0b<-
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d |>
filter(dit_2<1| dit_2<1)|>
mutate(rn= case_when(dit_1>0~ rn_1, dit_2>0~ rn_2, T~NA_real_)) |>
pull(rn)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
SISTRAT23_c1_2010_2022_df_prev1k<-
SISTRAT23_c1_2010_2022_df_prev1j|>
(\(df) {
cat(paste0("4. Database before discarding cases with less than one day in treatment, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4. Database before discarding cases with less than one day in treatment, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})()|>
filter(!(rn %in% discard_0c_0b))|>
mutate(OBS= case_when(rn %in% keep_0c_0b~ paste0(as.character(OBS),";","4.3.0b.Discard treatment episode with no days in treatment"), T~ OBS))|>
(\(df) {
cat(paste0("4. Database after correcting overlapping with <= 30 days of overlapping, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4. Database after correcting overlapping with <= 30 days of overlapping, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1j))stop("Error: Added treatment episodes in the process")
df
})()
# 4. Database before discarding cases with less than one day in treatment, cases: 150,182
# 4. Database before discarding cases with less than one day in treatment, RUNs: 106,283
# 4. Database after correcting overlapping with <= 30 days of overlapping, cases: 150,181
# 4. Database after correcting overlapping with <= 30 days of overlapping, RUNs: 106,283 4. Less than one day in treatment, cases: 1
4. Less than one day in treatment, RUNs: 1
4. Database before discarding cases with less than one day in treatment, cases: 150,182
4. Database before discarding cases with less than one day in treatment, RUNs: 106,283
4. Database after correcting overlapping with <= 30 days of overlapping, cases: 150,181
4. Database after correcting overlapping with <= 30 days of overlapping, RUNs: 106,283
The new database is called SISTRAT23_c1_2010_2022_df_prev1k.
0.b.6 Citeria based on sharing center ID, SENDA financing status, treatment length and referral discharge
We apply the detection of duplicates again and scenarios
Code
CONS_C1_df_dup_intervals_after_miss_less30d_0d<-
SISTRAT23_c1_2010_2022_df_prev1k|>
mutate(disch_date_num_miss= ifelse(is.na(disch_date_num_rec2), 19475, disch_date_num_rec2))|> #equivalente a 2023-04-28 as.numeric(as.Date("2023-01-01"))
rename("hash_key_2"="hash_key", "rn2"="rn")|>
select(rn2, hash_key_2, TABLE, adm_age_rec, adm_date_rec, adm_date_rec_num , disch_date_rec0, disch_date_num_miss, dit_rec3, id_centro, tr_compliance_rec, plan_type, senda)|>
#dplyr::filter(motivodeegreso!="Derivación")|>
data.table::as.data.table()
overlap_dates_C1_after_miss_less30d_0d <- janitor::clean_names(
sqldf::sqldf(
"
SELECT *
FROM CONS_C1_df_dup_intervals_after_miss_less30d_0d AS x
INNER JOIN CONS_C1_df_dup_intervals_after_miss_less30d_0d AS y
ON x.hash_key_2 = y.hash_key_2
AND x.rn2 < y.rn2 -- Avoids duplicates (eg.: x vs y and then y vs x)
AND x.adm_date_rec_num < y.disch_date_num_miss -- x Admitted before being admitted into another treatment
AND x.disch_date_num_miss > y.adm_date_rec_num -- x Discharged after being admitted in other
"
)) |>
`colnames<-`(c("rn_1", "hash_key_1", "ano_bd_1", "adm_age_1", "adm_date_1", "adm_date_rec_num_1", "disch_date_1", "disch_date_num_1", "dit_1", "id_centro_1", "tr_compliance_1", "plan_type_1", "senda_1", "rn_2",
"hash_key_2", "ano_bd_2", "adm_age_2", "adm_date_2", "adm_date_rec_num_2", "disch_date_2", "disch_date_num_2", "dit_2", "id_centro_2", "tr_compliance_2", "plan_type_2", "senda_2"))
cat(paste0("Number of overlapped dates, observations: ", nrow(overlap_dates_C1_after_miss_less30d_0d)),"\n")
cat(paste0("Number of overlapped dates, RUNs: ", nrow(distinct(overlap_dates_C1_after_miss_less30d_0d, hash_key_1))))
#Number of overlapped dates, observations: 559 ; 524 june 2025
#Number of overlapped dates, RUNs: 419 ; 411 june 2025
#The rows on the left originate from older databases.
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d <-
as_tidytable(overlap_dates_C1_after_miss_less30d_0d)|>
mutate(pair_id= paste0(rn_1,"_",rn_2))|>
mutate(same_id=ifelse(id_centro_1==id_centro_2,1,0))|>
mutate(bd_2_earlier=ifelse(ano_bd_2>ano_bd_1,1,0))|> #es el dato de la derecha de una base de datos mas reciente.
mutate(senda_status= case_when(senda_1=="si" & senda_2=="si"~ "both yes", senda_1=="no" & senda_2=="no"~ "both no", senda_1=="no" & senda_2=="si"~ "second yes", senda_1=="no" & senda_2=="no"~ "second no", T~NA_character_))|>
mutate(referral= ifelse(tr_compliance_1=="referral",1,0))|>
mutate(days_overlapped=disch_date_num_1-adm_date_rec_num_2)|> # para que hayan dias positivos. Se supone que la fecha de egreso es más reciente que la fecha de ingreso del evento que superpone.
mutate(more_dit=ifelse(dit_2>dit_1,1,0))|> #más días tratado en 2
mutate(trat_1_within_2=ifelse(disch_date_num_1<disch_date_num_2 & adm_date_rec_num_1>adm_date_rec_num_2,1,0))|>
mutate(trat_2_within_1=ifelse(disch_date_num_2<disch_date_num_1 & adm_date_rec_num_2>adm_date_rec_num_1,1,0))|>
select(-hash_key_2) |>
rename("hash_key"="hash_key_1")Number of overlapped dates, observations: 524
Number of overlapped dates, RUNs: 411
In 2020, we followed this rules to discard overlapping cases. Now, we are trying to apply them sequentially rather than all at once.
Code
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
bpmn::bpmn(paste0(wdpath,"cons/_input/overlapped_ranges_decision_tree.bpmn"))Decision Tree for the Discard of Overlapping Dates in Cases
If we check the IDs of the centers with the most overlapping treatment days, we can see some sort of pattern. We think that this could be related to small changes in treatment modality or setting within treatment centers, similar to internal referrals. Sometimes can be related to the change of centers due to termination of agreements with SENDA.
Code
SISTRAT23_c1_2010_2022_df_prev1k|>
select(id_centro, nombre_centro_rec) |>
filter(id_centro %in% attr(rev(sort(table(c(CONS_C1_df_dup_overlaps_COMP_after_miss_less30d$id_centro_1, CONS_C1_df_dup_overlaps_COMP_after_miss_less30d$id_centro_2)))),"names")[1:20]) |>
distinct(id_centro, .keep_all=T) |>
knitr::kable("markdown", caption= "Most frequent treatment centers with overlapped treatment dates")| id_centro | nombre_centro_rec |
|---|---|
| 148 | cosam quilicura |
| 238 | cosam la pintana |
| 118 | cosam lota |
| 294 | cosam talagante |
| 142 | centro de trat. y rehab. para personas con consumo perjudicial o dependencia a alcohol y/o drogas colina (ct. colina pr) |
| 166 | cosam enrique paris |
| 291 | cosam melipilla |
| 502 | centro de responsabilidad de salud mental del complejo asistencial dr.victor rios ruiz |
| 161 | centro comunitario de salud mental familiar (cosam pudahuel) |
| 123 | cosam newen |
| 122 | hospital de tome, centro superarte |
| 295 | crs salvador allende |
| 109 | cosam concepcion |
| 117 | comunidad terapeutica villamavida |
| 146 | cosam lampa |
| 147 | comunidad terapeutica manresa |
| 141 | cosam colina |
| 106 | cosam nuble (cadem de chillan) |
| 136 | consultorio alejandro gutierrez |
| 328 | cosam alenmoguen |
Hence, treatment center might be an important criteria to judge overlappings.
Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("(==========================================================================)\n")
cat("Same center ID, Both tr. SENDA Yes/No, An episode in the middle of the other")
invisible("Yes")
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d |>
#discard multiple treatments
filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
#same CENTER ID
filter(same_id==1)|>
#both financed/not financed by SENDA
filter(grepl("both",senda_status))|>
#tr. in the middle of another
filter(trat_1_within_2==1|trat_2_within_1==1)|>
#select the row of the treatment to discard and the row of the treatment to keep
mutate(rn_disc= case_when(trat_2_within_1==1~ rn_1, trat_1_within_2==1~ rn_2),
rn_keep= case_when(trat_1_within_2==1~ rn_1, trat_2_within_1==1~ rn_2)) |>
(\(df) {
cat(paste0("4.3a. same center, both financed/not financed by SENDA, one episode in the middle of the other, cases: ", formatC(nrow(df)*2, big.mark=",")),"\n")
cat(paste0("4.3a. same center, both financed/not financed by SENDA, one episode in the middle of the other, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
pull(df, rn_disc) ->> row_3a_discard_1st_tr
pull(df, rn_keep) ->> row_3a_keep_2nd_tr
})()
# 4.3a. same center, both financed/not financed by SENDA, one episode in the middle of the other, cases: 114 ; 116 june 2025
# 4.3a. same center, both financed/not financed by SENDA, one episode in the middle of the other, RUNs: 57 ; 58 june 2025
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Same center ID; Both treatments are SENDA Yes or No= No; oldest episode should be modified")
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
mod_4ab<-
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d |>
#discard multiple treatments
filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
#same CENTER ID
filter(same_id==1)|>
#both financed/not financed by SENDA
filter(grepl("both",senda_status))|>
#no tr. in the middle of another
filter(trat_1_within_2==0 & trat_2_within_1==0)|>
# referral in the cause of discharge of the treatment with latter discharge date (oldest)
mutate(crit_4ab= case_when(adm_date_rec_num_2< adm_date_rec_num_1 & tr_compliance_2=="referral"~ "4a.1. subtract days to second episode", adm_date_rec_num_1< adm_date_rec_num_2 & tr_compliance_1=="referral"~ "4a.2. subtract days to first episode", adm_date_rec_num_2<adm_date_rec_num_1 & tr_compliance_2!="referral"~ "4b.1. change cause of discharge and subtract days to second episode", adm_date_rec_num_1<adm_date_rec_num_2 & tr_compliance_1!="referral"~ "4b.2. change cause of discharge and subtract days to first episode", T~NA_character_ ))|>
# which is the oldest
mutate(oldest= case_when(adm_date_rec_num_1<adm_date_rec_num_2~"oldest_1", T~ "oldest_2")) |>
#select the row of the treatment to discard and the row of the treatment to keep
#4a) change amount of days treated
mutate(disch_date_num_rec_2= case_when(grepl("\\.1\\.", crit_4ab)~ adm_date_rec_num_1-1, T~disch_date_num_2),
disch_date_num_rec_1= case_when(grepl("\\.2\\.", crit_4ab)~ adm_date_rec_num_2-1, T~disch_date_num_1))|>
#4b) also change cause of discharge,now only for those with another cause of discharge
mutate(tr_compliance_rec_2= case_when(grepl("4b\\.1", crit_4ab)~ "referral", T~tr_compliance_2),
tr_compliance_rec_1= case_when(grepl("4b\\.2", crit_4ab)~ "referral", T~tr_compliance_1))|>
mutate(rn_mod_1= case_when(grepl("\\.1\\.", crit_4ab)~ rn_1, grepl("\\.2\\.", crit_4ab)~ rn_2))|>
(\(df) {
cat(paste0("4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (2nd), cases: ", formatC(nrow(subset(df, grepl("^4a\\.1", crit_4ab)))*2, big.mark=",")),"\n")
cat(paste0("4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (2nd), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^4a\\.1", crit_4ab)), hash_key)), big.mark=",")),"\n")
cat(paste0("4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (1st), cases: ", formatC(nrow(subset(df, grepl("^4a\\.2", crit_4ab)))*2, big.mark=",")),"\n")
cat(paste0("4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (1st), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^4a\\.2", crit_4ab)), hash_key)), big.mark=",")),"\n")
cat(paste0("4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), cases: ", formatC(nrow(subset(df,grepl("^4b\\.1", crit_4ab)))*2, big.mark=",")),"\n")
cat(paste0("4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^4b\\.1", crit_4ab)), hash_key)), big.mark=",")),"\n")
cat(paste0("4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (1st), cases: ", formatC(nrow(subset(df,grepl("^4b\\.2", crit_4ab)))*2, big.mark=",")),"\n")
cat(paste0("4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (1st), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^4b\\.2", crit_4ab)), hash_key)), big.mark=",")),"\n")
df
})() |>
select(hash_key, rn_1, rn_2, oldest, disch_date_num_rec_1, disch_date_num_rec_2, tr_compliance_rec_1, tr_compliance_rec_2, crit_4ab)
# 4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (2nd), cases: 2
# 4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (2nd), RUNs: 1
# 4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (1st), cases: 38
# 4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (1st), RUNs: 19
# 4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), cases: 4
# 4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), RUNs: 2
# 4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (1st), cases: 74
# 4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (1st), RUNs: 37
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("(==========================================================================)\n")
cat("2b-Same center ID, Different SENDA financing status")
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d |>
#discard multiple treatments
filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
#same CENTER ID
filter(same_id==1)|>
#Different financing status
filter(!grepl("both",senda_status))|>
#select the row of the earliest treatment and check if it is financed by SENDA
mutate(crit_2b12= case_when(adm_date_rec_num_2>adm_date_rec_num_1 & senda_2=="si"~ "2b1.1. earliest treatment (2nd) financed by SENDA", adm_date_rec_num_1>adm_date_rec_num_2 & senda_1=="si"~ "2b1.2. earliest treatment (1st) financed by SENDA", adm_date_rec_num_2>adm_date_rec_num_1 & senda_2=="no"~ "2b2.1. earliest treatment (2nd) not financed by SENDA", adm_date_rec_num_1>adm_date_rec_num_2 & senda_1=="no"~ "2b2.2. earliest treatment (1st) not financed by SENDA", T~NA_character_ ))|> # janitor::tabyl(crit_2b12)
#select the row of the treatment to discard and the row of the treatment to keep
mutate(rn_keep_2b1= case_when(grepl("^2b1\\.1", crit_2b12)~ rn_2, grepl("^2b1\\.2", crit_2b12)~ rn_1, T~NA_real_)) |>
mutate(rn_disc_2b1= case_when(grepl("^2b1\\.1", crit_2b12)~ rn_1, grepl("^2b1\\.2", crit_2b12)~ rn_2, T~NA_real_)) |>
mutate(rn_keep_2b2= case_when(grepl("^2b2\\.1", crit_2b12)~ rn_1, grepl("^2b2\\.2", crit_2b12)~ rn_2, T~NA_real_)) |>
mutate(rn_disc_2b2= case_when(grepl("^2b2\\.1", crit_2b12)~ rn_2, grepl("^2b2\\.2", crit_2b12)~ rn_1, T~NA_real_)) |>
(\(df) {
cat(paste0("4.2b1.1 same center, one financed by SENDA, earliest treatment (2nd), cases: ", formatC(nrow(subset(df, grepl("^2b1\\.1", crit_2b12)))*2, big.mark=",")),"\n")
cat(paste0("4.2b1.1 same center, one financed by SENDA, earliest treatment (2nd), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2b1\\.1", crit_2b12)), hash_key)), big.mark=",")),"\n")
cat(paste0("4.2b1.2 same center, one financed by SENDA, earliest treatment (1st), cases: ", formatC(nrow(subset(df, grepl("^2b1\\.2", crit_2b12)))*2, big.mark=",")),"\n")
cat(paste0("4.2b1.2 same center, one financed by SENDA, earliest treatment (1st), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2b1\\.2", crit_2b12)), hash_key)), big.mark=",")),"\n")
cat(paste0("4.2b2.1 same center, one financed by SENDA, not the earliest treatment (2nd), cases: ", formatC(nrow(subset(df,grepl("^2b2\\.1", crit_2b12)))*2, big.mark=",")),"\n")
cat(paste0("4.2b2.1 same center, one financed by SENDA, not the earliest treatment (2nd), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2b2\\.1", crit_2b12)), hash_key)), big.mark=",")),"\n")
cat(paste0("4.2b2.2 same center, one financed by SENDA, not the earliest treatment (1st), cases: ", formatC(nrow(subset(df,grepl("^2b2\\.2", crit_2b12)))*2, big.mark=",")),"\n")
cat(paste0("4.2b2.2 same center, one financed by SENDA, not the earliest treatment (1st), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2b2\\.2", crit_2b12)), hash_key)), big.mark=",")),"\n")
pull(df, rn_keep_2b2) ->> row_2b2_discard_1st_tr
pull(df, rn_keep_2b2) ->> row_2b2_keep_2nd_tr
pull(df, rn_disc_2b1) ->> row_2b1_discard_1st_tr
pull(df, rn_keep_2b1) ->> row_2b1_keep_2nd_tr
})()
# 4.2b1.1 same center, one financed by SENDA, earliest treatment (2nd), cases: 4
# 4.2b1.1 same center, one financed by SENDA, earliest treatment (2nd), RUNs: 2
# 4.2b1.2 same center, one financed by SENDA, earliest treatment (1st), cases: 8
# 4.2b1.2 same center, one financed by SENDA, earliest treatment (1st), RUNs: 4
# 4.2b2.1 same center, one financed by SENDA, not the earliest treatment (2nd), cases: 14
# 4.2b2.1 same center, one financed by SENDA, not the earliest treatment (2nd), RUNs: 7
# 4.2b2.2 same center, one financed by SENDA, not the earliest treatment (1st), cases: 8
# 4.2b2.2 same center, one financed by SENDA, not the earliest treatment (1st), RUNs: 4
# at june 2025
# 4.2b1.1 same center, one financed by SENDA, earliest treatment (2nd), cases: 4
# 4.2b1.1 same center, one financed by SENDA, earliest treatment (2nd), RUNs: 2
# 4.2b1.2 same center, one financed by SENDA, earliest treatment (1st), cases: 8
# 4.2b1.2 same center, one financed by SENDA, earliest treatment (1st), RUNs: 4
# 4.2b2.1 same center, one financed by SENDA, not the earliest treatment (2nd), cases: 16
# 4.2b2.1 same center, one financed by SENDA, not the earliest treatment (2nd), RUNs: 8
# 4.2b2.2 same center, one financed by SENDA, not the earliest treatment (1st), cases: 10
# 4.2b2.2 same center, one financed by SENDA, not the earliest treatment (1st), RUNs: 5
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("(==========================================================================)\n")
cat("35ab- Different center IDs; same SENDA financing statuses (Yes/No); one episode in the middle of the other; and the earliest treatment comes from a more recent yearly database")
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d |>
#discard multiple treatments
filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
#different CENTER ID
filter(same_id!=1)|>
#Same financing status
filter(grepl("both",senda_status))|>
#tr. in the middle of another
filter(trat_1_within_2==1|trat_2_within_1==1)|>
#earliest treatment comes from a more recent yearly database
#select the row of the treatment to discard and the row of the treatment to keep
#filter(case_when(trat_2_within_1==1 & ano_bd_1> ano_bd_2~T,T~F)) |> View()
#Treatments with retrieval dates earlier within a treatment might be strange
mutate(rn_disc_35a= case_when(trat_2_within_1==1 & ano_bd_1> ano_bd_2~ rn_2, trat_1_within_2==1 & ano_bd_2> ano_bd_1~ rn_1, T~NA_real_))|>
mutate(rn_disc_35b= case_when(trat_2_within_1==1 & ano_bd_1<= ano_bd_2~ rn_2, trat_1_within_2==1 & ano_bd_2<= ano_bd_1~ rn_1, T~NA_real_))|>
mutate(rn_keep_35a= case_when(trat_2_within_1==1 & ano_bd_1> ano_bd_2~ rn_1, trat_1_within_2==1 & ano_bd_2> ano_bd_1~ rn_2, T~NA_real_))|>
mutate(rn_keep_35b= case_when(trat_2_within_1==1 & ano_bd_1<= ano_bd_2~ rn_1, trat_1_within_2==1 & ano_bd_2<= ano_bd_1~ rn_2, T~NA_real_))|>
(\(df) {
pull(df, rn_disc_35a) ->> row_35a_disc_check_after
pull(df, rn_keep_35a) ->> row_35a_keep_check_after
pull(df, rn_disc_35b) ->> row_35b_discard_shortest
pull(df, rn_keep_35b) ->> row_35b_keep_largest
})()
cat(paste0("4.35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, cases: ", formatC(length(row_35a_keep_check_after[!is.na(row_35a_keep_check_after)])*2, big.mark=",")),"\n")
cat(paste0("4.35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, RUNs: ", formatC(length(row_35a_keep_check_after[!is.na(row_35a_keep_check_after)]), big.mark=",")),"\n")
cat(paste0("4.35b. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was not earliest, cases: ", formatC(length(row_35b_keep_largest[!is.na(row_35b_keep_largest)])*2, big.mark=",")),"\n")
cat(paste0("4.35b. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was not earliest, RUNs: ", formatC(length(row_35b_keep_largest[!is.na(row_35b_keep_largest)]), big.mark=",")),"\n")
# june 2025
# 4.35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, cases: 68
# 4.35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, RUNs: 34
# 4.35b. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was not earliest, cases: 54
# 4.35b. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was not earliest, RUNs: 27
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("(==========================================================================)\n")
cat(paste0("35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, tr. length > 3 yrs., cases: "))
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d|>
filter(
rn_1 %in% row_35a_keep_check_after[!is.na(row_35a_keep_check_after)] |
rn_2 %in% row_35a_keep_check_after[!is.na(row_35a_keep_check_after)]
)|>
filter(case_when(trat_2_within_1==1 & ano_bd_1> ano_bd_2 & dit_1>1094~ T, T~ F))|>
nrow()
cat(paste0("35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, tr. length > 3 yrs., cases: "))
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d|>
filter(
rn_1 %in% row_35a_keep_check_after[!is.na(row_35a_keep_check_after)] |
rn_2 %in% row_35a_keep_check_after[!is.na(row_35a_keep_check_after)]
)|>
filter(case_when(trat_1_within_2==1 & ano_bd_2> ano_bd_1 & dit_2>1094~ T, T~ F))|>
nrow()
cat("The event of the left is the one that should be modified (aberant and largest treatment)")
replace_disch_date_35a21<-
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d |>
filter(
rn_1 %in% row_35a_keep_check_after[!is.na(row_35a_keep_check_after)] |
rn_2 %in% row_35a_keep_check_after[!is.na(row_35a_keep_check_after)]
)|>
filter(case_when(trat_2_within_1==1 & ano_bd_1> ano_bd_2 & dit_1>1094~ T, T~ F))|>
mutate(disch_date_num_rec_35a21= adm_date_rec_num_1-1)|>
select(rn_1, disch_date_num_rec_35a21)
cat("The event of the right (rn_2) is the one that should be modified (aberant and largest treatment)")
replace_disch_date_35a22<-
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d|>
filter(
rn_1 %in% row_35a_keep_check_after[!is.na(row_35a_keep_check_after)] |
rn_2 %in% row_35a_keep_check_after[!is.na(row_35a_keep_check_after)]
)|>
filter(case_when(trat_1_within_2==1 & ano_bd_2> ano_bd_1 & dit_2>1094~ T, T~ F))|>
mutate(disch_date_num_rec_35a22= adm_date_rec_num_1-1)|>
select(rn_2, disch_date_num_rec_35a22)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("(==========================================================================)\n")
cat("2_4ab- Different center IDs; same SENDA financing statuses (Yes/No); no treatment in the middle of another; latest (not the earlier) treatment have a referral for cause of discharge; oldest episode should be modified")
mod_2_4ab<-
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d |>
#discard multiple treatments
filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
#different CENTER ID
filter(same_id!=1)|>
#Same financing status
filter(grepl("both",senda_status))|>
#tr. not in the middle of another
filter(trat_1_within_2==0 & trat_2_within_1==0)|>
# referral in the cause of discharge of the treatment with latter discharge date (oldest)
mutate(crit_2_4ab= case_when(adm_date_rec_num_2<adm_date_rec_num_1 & tr_compliance_2=="referral"~ "2_4a.1. subtract days to second episode", adm_date_rec_num_1<adm_date_rec_num_2 & tr_compliance_1=="referral"~ "2_4a.2. subtract days to first episode", adm_date_rec_num_2<adm_date_rec_num_1 & tr_compliance_2!="referral"~ "2_4b.1. change cause of discharge and subtract days to second episode", adm_date_rec_num_1<adm_date_rec_num_2 & tr_compliance_1!="referral"~ "2_4b.2. change cause of discharge and subtract days to first episode", T~NA_character_ ))|> #janitor::tabyl(crit_2_4ab)
# which is the oldest
mutate(oldest= case_when(adm_date_rec_num_1<adm_date_rec_num_2~"oldest_1", T~ "oldest_2"))|>
#select the row of the treatment to discard and the row of the treatment to keep
#4a) change amount of days treated
mutate(disch_date_num_rec_2= case_when(grepl("\\.1\\.", crit_2_4ab)~ adm_date_rec_num_1-1, T~disch_date_num_2),
disch_date_num_rec_1= case_when(grepl("\\.2\\.", crit_2_4ab)~ adm_date_rec_num_2-1, T~disch_date_num_1))|>
#4b) also change cause of discharge,now only for those with another cause of discharge
mutate(tr_compliance_rec_2= case_when(grepl("2_4b\\.1", crit_2_4ab)~ "referral", T~tr_compliance_2),
tr_compliance_rec_1= case_when(grepl("2_4b\\.2", crit_2_4ab)~ "referral", T~tr_compliance_1))|>
mutate(rn_mod_1= case_when(grepl("\\.1\\.", crit_2_4ab)~ rn_1, grepl("\\.2\\.", crit_2_4ab)~ rn_2))|>
(\(df) {
cat(paste0("4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (2nd), cases: ", formatC(nrow(subset(df, grepl("^2_4a\\.1", crit_2_4ab)))*2, big.mark=",")),"\n")
cat(paste0("4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (2nd), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2_4a\\.1", crit_2_4ab)), hash_key)), big.mark=",")),"\n")
cat(paste0("4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (1st), cases: ", formatC(nrow(subset(df, grepl("^2_4a\\.2", crit_2_4ab)))*2, big.mark=",")),"\n")
cat(paste0("4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (1st), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2_4a\\.2", crit_2_4ab)), hash_key)), big.mark=",")),"\n")
cat(paste0("4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), cases: ", formatC(nrow(subset(df,grepl("^2_4b\\.1", crit_2_4ab)))*2, big.mark=",")),"\n")
cat(paste0("4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2_4b\\.1", crit_2_4ab)), hash_key)), big.mark=",")),"\n")
cat(paste0("4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (1st), cases: ", formatC(nrow(subset(df,grepl("^2_4b\\.2", crit_2_4ab)))*2, big.mark=",")),"\n")
cat(paste0("4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (1st), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2_4b\\.2", crit_2_4ab)), hash_key)), big.mark=",")),"\n")
df
})()|>
select(hash_key, rn_1, rn_2, oldest, disch_date_num_rec_1, disch_date_num_rec_2, tr_compliance_rec_1, tr_compliance_rec_2, crit_2_4ab)
#select the row of the treatment to discard and the row of the treatment to keep
# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (2nd), cases: 2
# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (2nd), RUNs: 1
# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (1st), cases: 118
# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (1st), RUNs: 59
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), cases: 0
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), RUNs: 0
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (1st), cases: 66
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (1st), RUNs: 33
# june 2025
# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (2nd), cases: 2
# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (2nd), RUNs: 1
# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (1st), cases: 118
# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (1st), RUNs: 59
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), cases: 0
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), RUNs: 0
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (1st), cases: 68
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (1st), RUNs: 34
# crit_2_4ab n percent
# 2_4a.1. substract days to second episode 1 0.01075269
# 2_4a.2. substract days to first episode 59 0.63440860
# 2_4b.2. change cause of discharge and substract days to first episode 33 0.35483871
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("(==========================================================================)\n")
cat("2_2b-Different center ID, Different SENDA financing status, earlier treatment financed by SENDA")
CONS_C1_df_dup_overlaps_COMP_after_miss_less30d_0d|>
#discard multiple treatments
filter(!(rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
#different CENTER ID
filter(same_id!=1)|>
#Different financing status
filter(!grepl("both",senda_status))|>
#select the row of the earliest treatment and check if it is financed by SENDA
mutate(crit_2_2b12= case_when(adm_date_rec_num_2>adm_date_rec_num_1 & senda_2=="si"~ "2_2b1.1. earliest treatment (2nd) financed by SENDA", adm_date_rec_num_1>adm_date_rec_num_2 & senda_1=="si"~ "2_2b1.2. earliest treatment (1st) financed by SENDA", adm_date_rec_num_2>adm_date_rec_num_1 & senda_2=="no"~ "2_2b2.1. earliest treatment (2nd) not financed by SENDA", adm_date_rec_num_1>adm_date_rec_num_2 & senda_1=="no"~ "2_2b2.2. earliest treatment (1st) not financed by SENDA", T~NA_character_ ))|> # janitor::tabyl(crit_2b12)
#select the row of the treatment to discard and the row of the treatment to keep
mutate(rn_keep= case_when(grepl("^2_2b1\\.1", crit_2_2b12)~ rn_2, grepl("^2_2b1\\.2", crit_2_2b12)~ rn_1, grepl("^2b2\\.1", crit_2_2b12)~ rn_2, grepl("^2_2b2\\.2", crit_2_2b12)~ rn_1))|>
mutate(rn_disc= case_when(grepl("^2_2b1\\.1", crit_2_2b12)~ rn_1, grepl("^2_2b1\\.2", crit_2_2b12)~ rn_2, grepl("^2b2\\.1", crit_2_2b12)~ rn_1, grepl("^2_2b2\\.2", crit_2_2b12)~ rn_2))|> #janitor::tabyl(crit_2_2b12)
(\(df) {
cat(paste0("4.2_2b1.1 different center, one financed by SENDA, earliest treatment (2nd), cases: ", formatC(nrow(subset(df, grepl("^2_2b1\\.1", crit_2_2b12)))*2, big.mark=",")),"\n")
cat(paste0("4.2_2b1.1 different center, one financed by SENDA, earliest treatment (2nd), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2_2b1\\.1", crit_2_2b12)), hash_key)), big.mark=",")),"\n")
cat(paste0("4.2_2b1.2 different center, one financed by SENDA, earliest treatment (1st), cases: ", formatC(nrow(subset(df, grepl("^2_2b1\\.2", crit_2_2b12)))*2, big.mark=",")),"\n")
cat(paste0("4.2_2b1.2 different center, one financed by SENDA, earliest treatment (1st), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2_2b1\\.2", crit_2_2b12)), hash_key)), big.mark=",")),"\n")
cat(paste0("4.2_2b2.1 different center, one financed by SENDA, not the earliest treatment (2nd), cases: ", formatC(nrow(subset(df,grepl("^2_2b2\\.1", crit_2_2b12)))*2, big.mark=",")),"\n")
cat(paste0("4.2_2b2.1 different center, one financed by SENDA, not the earliest treatment (2nd), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2_2b2\\.1", crit_2_2b12)), hash_key)), big.mark=",")),"\n")
cat(paste0("4.2_2b2.2 different center, one financed by SENDA, not the earliest treatment (1st), cases: ", formatC(nrow(subset(df,grepl("^2_2b2\\.2", crit_2_2b12)))*2, big.mark=",")),"\n")
cat(paste0("4.2_2b2.2 different center, one financed by SENDA, not the earliest treatment (1st), RUNs: ", formatC(nrow(distinct(subset(df,grepl("^2_2b2\\.2", crit_2_2b12)), hash_key)), big.mark=",")),"\n")
pull(df, rn_disc) ->> row_2_2b_discard_1st_tr
pull(df, rn_keep) ->> row_2_2b_keep_2nd_tr
})()
# 4.2_2b1.1 different center, one financed by SENDA, earliest treatment (2nd), cases: 4
# 4.2_2b1.1 different center, one financed by SENDA, earliest treatment (2nd), RUNs: 2
# 4.2_2b1.2 different center, one financed by SENDA, earliest treatment (1st), cases: 16
# 4.2_2b1.2 different center, one financed by SENDA, earliest treatment (1st), RUNs: 8
# 4.2_2b2.1 different center, one financed by SENDA, not the earliest treatment (2nd), cases: 70
# 4.2_2b2.1 different center, one financed by SENDA, not the earliest treatment (2nd), RUNs: 35
# 4.2_2b2.2 different center, one financed by SENDA, not the earliest treatment (1st), cases: 22
# 4.2_2b2.2 different center, one financed by SENDA, not the earliest treatment (1st), RUNs: 11
# june 2025
# 4.2_2b1.1 different center, one financed by SENDA, earliest treatment (2nd), cases: 6
# 4.2_2b1.1 different center, one financed by SENDA, earliest treatment (2nd), RUNs: 3
# 4.2_2b1.2 different center, one financed by SENDA, earliest treatment (1st), cases: 16
# 4.2_2b1.2 different center, one financed by SENDA, earliest treatment (1st), RUNs: 8
# 4.2_2b2.1 different center, one financed by SENDA, not the earliest treatment (2nd), cases: 72
# 4.2_2b2.1 different center, one financed by SENDA, not the earliest treatment (2nd), RUNs: 36
# 4.2_2b2.2 different center, one financed by SENDA, not the earliest treatment (1st), cases: 22
# 4.2_2b2.2 different center, one financed by SENDA, not the earliest treatment (1st), RUNs: 11 (==========================================================================)
Same center ID, Both tr. SENDA Yes/No, An episode in the middle of the other4.3a. same center, both financed/not financed by SENDA, one episode in the middle of the other, cases: 116
4.3a. same center, both financed/not financed by SENDA, one episode in the middle of the other, RUNs: 58
Same center ID; Both treatments are SENDA Yes or No= No; oldest episode should be modified4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (2nd), cases: 2
4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (2nd), RUNs: 1
4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (1st), cases: 38
4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (1st), RUNs: 19
4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), cases: 4
4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), RUNs: 2
4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (1st), cases: 74
4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (1st), RUNs: 37
(==========================================================================)
2b-Same center ID, Different SENDA financing status4.2b1.1 same center, one financed by SENDA, earliest treatment (2nd), cases: 4
4.2b1.1 same center, one financed by SENDA, earliest treatment (2nd), RUNs: 2
4.2b1.2 same center, one financed by SENDA, earliest treatment (1st), cases: 8
4.2b1.2 same center, one financed by SENDA, earliest treatment (1st), RUNs: 4
4.2b2.1 same center, one financed by SENDA, not the earliest treatment (2nd), cases: 16
4.2b2.1 same center, one financed by SENDA, not the earliest treatment (2nd), RUNs: 8
4.2b2.2 same center, one financed by SENDA, not the earliest treatment (1st), cases: 10
4.2b2.2 same center, one financed by SENDA, not the earliest treatment (1st), RUNs: 5
(==========================================================================)
35ab- Different center IDs; same SENDA financing statuses (Yes/No); one episode in the middle of the other; and the earliest treatment comes from a more recent yearly database4.35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, cases: 68
4.35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, RUNs: 34
4.35b. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was not earliest, cases: 54
4.35b. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was not earliest, RUNs: 27
(==========================================================================)
35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, tr. length > 3 yrs., cases: [1] 0
35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, retrieval date was the earliest, tr. length > 3 yrs., cases: [1] 4
The event of the left is the one that should be modified (aberant and largest treatment)The event of the right (rn_2) is the one that should be modified (aberant and largest treatment)(==========================================================================)
2_4ab- Different center IDs; same SENDA financing statuses (Yes/No); no treatment in the middle of another; latest (not the earlier) treatment have a referral for cause of discharge; oldest episode should be modified4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (2nd), cases: 2
4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (2nd), RUNs: 1
4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (1st), cases: 118
4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral (1st), RUNs: 59
4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), cases: 0
4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), RUNs: 0
4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (1st), cases: 68
4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral (1st), RUNs: 34
(==========================================================================)
2_2b-Different center ID, Different SENDA financing status, earlier treatment financed by SENDA4.2_2b1.1 different center, one financed by SENDA, earliest treatment (2nd), cases: 6
4.2_2b1.1 different center, one financed by SENDA, earliest treatment (2nd), RUNs: 3
4.2_2b1.2 different center, one financed by SENDA, earliest treatment (1st), cases: 16
4.2_2b1.2 different center, one financed by SENDA, earliest treatment (1st), RUNs: 8
4.2_2b2.1 different center, one financed by SENDA, not the earliest treatment (2nd), cases: 72
4.2_2b2.1 different center, one financed by SENDA, not the earliest treatment (2nd), RUNs: 36
4.2_2b2.2 different center, one financed by SENDA, not the earliest treatment (1st), cases: 22
4.2_2b2.2 different center, one financed by SENDA, not the earliest treatment (1st), RUNs: 11
We apply these rules into the dataset and check overlappings again
Code
# 2_4ab- Different center IDs; same SENDA financing statuses (Yes/No); no treatment in the middle of another; latest (not the earlier) treatment have a referral for cause of discharge
# 4.2_4a. different center, both financed/not financed by SENDA, oldest episode had referral
# 4.2_4b. different center, both financed/not financed by SENDA, oldest episode w/o referral
# 2_2b-Different center ID, Different SENDA financing status, earlier treatment financed by SENDA
# 2_2b1.1 different center, one financed by SENDA, earliest treatment
# 2_2b2.1 different center, one financed by SENDA, not the earliest treatment
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
SISTRAT23_c1_2010_2022_df_prev1l<-
SISTRAT23_c1_2010_2022_df_prev1k|>
(\(df) {
cat(paste0("4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})()|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
filter(!(rn %in% row_3a_discard_1st_tr))|>
mutate(OBS= case_when(rn %in% row_3a_keep_2nd_tr~ paste0(as.character(OBS),";","4.3a.Same center ID, same SENDA financing status, one tr. episode in the middle of the other. Discarded shorter episode"), T~ OBS))|>
#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
left_join(mod_4ab, by=c("rn"="rn_1"), suffix=c("","_4ab_1st_ep"))|>
left_join(mod_4ab, by=c("rn"="rn_2"), suffix=c("","_4ab_2nd_ep"))|>
#given it is paired with row numbers. The matching rownumbers will be those paired
mutate(disch_date_num_rec3= case_when(oldest=="oldest_1"~ disch_date_num_rec_1, T~ disch_date_num_rec2))|>
#because the second episode matched with the row number of the second column
mutate(disch_date_num_rec3= case_when(oldest_4ab_2nd_ep=="oldest_2"~ disch_date_num_rec_2_4ab_2nd_ep, T~ disch_date_num_rec3))|>
# change cause of discharge to referral
mutate(tr_compliance_rec= case_when(oldest=="oldest_1"~ "referral", T~ tr_compliance_rec))|>
mutate(tr_compliance_rec= case_when(oldest_4ab_2nd_ep=="oldest_2"~ "referral", T~ tr_compliance_rec))|>
# add observations to rows.
mutate(OBS= case_when(oldest== "oldest_1" & grepl("^4b", crit_4ab)~ paste0(as.character(OBS),";","4.4b.Same center ID, same SENDA financing status, subtracted days to oldest episode and change discharge cause to referral"), T~ OBS))|>
mutate(OBS= case_when(oldest== "oldest_1" & grepl("^4a", crit_4ab)~ paste0(as.character(OBS),";","4.4a.Same center ID, same SENDA financing status, subtracted days to oldest episode"), T~ OBS))|>
mutate(OBS= case_when(oldest== "oldest_2" & grepl("^4b", crit_4ab_4ab_2nd_ep)~ paste0(as.character(OBS),";","4.4b.Same center ID, same SENDA financing status, subtracted days to oldest episode and change discharge cause to referral"), T~ OBS))|>
mutate(OBS= case_when(oldest== "oldest_2" & grepl("^4a", crit_4ab_4ab_2nd_ep)~ paste0(as.character(OBS),";","4.4a.Same center ID, same SENDA financing status, subtracted days to oldest episode"), T~ OBS))|>
#discard any columns related to the join with duplicates database
select(- (any_of(c(contains("4ab_2nd_ep"), contains("_4ab_1st_ep"))))) |>
select(-any_of(setdiff(colnames(mod_4ab), "hash_key")))|>
#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
filter(!(rn %in% c(row_2b1_discard_1st_tr[!is.na(row_2b1_discard_1st_tr)], row_2b2_discard_1st_tr[!is.na(row_2b2_discard_1st_tr)])))|>
mutate(OBS= case_when(rn %in% row_2b1_keep_2nd_tr[!is.na(row_2b1_keep_2nd_tr)]~ paste0(as.character(OBS),";","4.2_2b1.Different center ID, earliest tr. financed by SENDA, kept the earliest"), T~ OBS))|>
mutate(OBS= case_when(rn %in% row_2b2_keep_2nd_tr[!is.na(row_2b2_keep_2nd_tr)]~ paste0(as.character(OBS),";","4.2_2b2.Different center ID, earliest tr. not financed by SENDA, kept the oldest"), T~ OBS))|>
#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
filter(!(rn %in% row_35b_discard_shortest[!is.na(row_35b_discard_shortest)]))|>
mutate(OBS= case_when(rn %in% row_35b_keep_largest[!is.na(row_35b_keep_largest)]~ paste0(as.character(OBS),";","4.35b.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earlier, kept the lagest"), T~ OBS))|>
#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
# row_35a_keep_check_after row_35a_disc_check_after =need to check tr. duration. if >1045 and greater, then cut this treatment to the correct treatment wihtin
#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
#replace_disch_date_35a22 replace_disch_date_35a21
#row_35a_disc_check_after row_35a_keep_check_after #4.35a. different center, both financed/not financed by SENDA, one episode in the middle of the other, yearly retrieval date was the earliest
mutate(OBS= case_when(rn %in% setdiff(row_35a_keep_check_after, replace_disch_date_35a22$rn_2[!is.na(replace_disch_date_35a22$rn_2)])~ paste0(as.character(OBS),";","4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards"), T~ OBS))|>
mutate(OBS= case_when(rn %in% setdiff(row_35a_disc_check_after, replace_disch_date_35a22$rn_2[!is.na(replace_disch_date_35a22$rn_2)])~ paste0(as.character(OBS),";","4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards"), T~ OBS))|>
mutate(OBS= case_when(rn %in% setdiff(row_35a_keep_check_after, replace_disch_date_35a22$rn_1[!is.na(replace_disch_date_35a21$rn_1)])~ paste0(as.character(OBS),";","4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards"), T~ OBS))|>
mutate(OBS= case_when(rn %in% setdiff(row_35a_disc_check_after, replace_disch_date_35a22$rn_1[!is.na(replace_disch_date_35a21$rn_1)])~ paste0(as.character(OBS),";","4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards"), T~ OBS))|>
#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
# for the cases with >1094 days in tr.
left_join(replace_disch_date_35a21, by=c("rn"="rn_1"), suffix= c("","_35a21"))|>
left_join(replace_disch_date_35a22, by=c("rn"="rn_2"), suffix= c("","_35a22"))|>
mutate(disch_date_num_rec4= case_when(!is.na(disch_date_num_rec_35a21)~ disch_date_num_rec_35a21, T~ disch_date_num_rec3))|>
mutate(disch_date_num_rec4= case_when(!is.na(disch_date_num_rec_35a22)~ disch_date_num_rec_35a22, T~ disch_date_num_rec4))|>
mutate(OBS= case_when(rn %in% c(replace_disch_date_35a21$rn_1, replace_disch_date_35a22$rn_2)~ paste0(as.character(OBS),";","4.35a2.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, more 1094 days in tr., substract days in tr."), T~ OBS))|>
select(-disch_date_num_rec_35a21, -disch_date_num_rec_35a22)|>
#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
left_join(mod_2_4ab, by=c("rn"="rn_1"), suffix=c("","_2_4ab_1st_ep"))|>
left_join(mod_2_4ab, by=c("rn"="rn_2"), suffix=c("","_2_4ab_2nd_ep"))|>
#Rows: 150,081
#given it is paired with row numbers. The matching rownumbers will be those paired
mutate(disch_date_num_rec5= case_when(oldest=="oldest_1"~ disch_date_num_rec_1, T~ disch_date_num_rec4))|>
#because the second episode matched with the row number of the second column
mutate(disch_date_num_rec5= case_when(oldest_2_4ab_2nd_ep=="oldest_2"~ disch_date_num_rec_2_2_4ab_2nd_ep, T~ disch_date_num_rec5))|>
# change cause of discharge to referral
mutate(tr_compliance_rec= case_when(oldest=="oldest_1"~ "referral", T~ tr_compliance_rec))|>
mutate(tr_compliance_rec= case_when(oldest_2_4ab_2nd_ep=="oldest_2"~ "referral", T~ tr_compliance_rec))|>
# add observations to rows.
mutate(OBS= case_when(oldest== "oldest_1" & grepl("^2_4b", crit_2_4ab)~ paste0(as.character(OBS),";","4.2_4b.Different center ID, same SENDA financing status, subtracted days to oldest episode and change discharge cause to referral"), T~ OBS))|>
mutate(OBS= case_when(oldest== "oldest_1" & grepl("^2_4a", crit_2_4ab)~ paste0(as.character(OBS),";","4.2_4a.Differnt center ID, same SENDA financing status, subtracted days to oldest episode"), T~ OBS))|>
mutate(OBS= case_when(oldest== "oldest_2" & grepl("^2_4b", crit_2_4ab_2_4ab_2nd_ep)~ paste0(as.character(OBS),";","4.2_4b.Different center ID, same SENDA financing status, subtracted days to oldest episode and change discharge cause to referral"), T~ OBS))|>
mutate(OBS= case_when(oldest== "oldest_2" & grepl("^2_4a", crit_2_4ab_2_4ab_2nd_ep)~ paste0(as.character(OBS),";","4.2_4a.Different center ID, same SENDA financing status, subtracted days to oldest episode"), T~ OBS))|>
#discard any columns related to the join with duplicates database
select(- (any_of(c(contains("2_4ab_2nd_ep"), contains("2_4ab_1st_ep")))))|>
select(-any_of(setdiff(colnames(mod_2_4ab), "hash_key")))|>
#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
mutate(disch_date_num_rec5= disch_date_num_rec5, dit_rec5= disch_date_num_rec5- adm_date_rec_num, disch_date_rec5= as.Date(disch_date_num_rec5, "1970-01-01"))|>
#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
(\(df) {
cat(paste0("4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1k))stop("Error: Added treatment episodes in the process")
df
})()
# 4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, cases: 150,181
# 4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, RUNs: 106,283
# 4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, cases: 150,077
# 4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, RUNs: 106,283
#4.4a. same center, both financed/not financed by SENDA, oldest episode had referral (2nd), cases // 4.4b. same center, both financed/not financed by SENDA, oldest episode w/o referral (2nd), cases:4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, cases: 150,181
4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, RUNs: 106,283
4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, cases: 150,077
4. Database before apply rules based on center ID, SENDA financing status, referral cause and treatment length, RUNs: 106,283
The generated database is called SISTRAT23_c1_2010_2022_df_prev1l. We included the variables disch_date_rec, dit_rec5, and disch_date_num_rec5, as the result of replacing values for the deduplication process.
Probabilistic Matches
We selected matches in the rows with the same hash, treatment center ID, date of admission, type of plan, educational attainment, sex and ID, within ages and with a match score greater than or equal to 70.
Code
library(fastLink)
Sys.setenv(OMP_NUM_THREADS = parallel::detectCores()- 1)
cores_min_1 <- parallel::detectCores()- 1
data <- as_tidytable(mutate(SISTRAT23_c1_2010_2022_df_prev1l, across(where(is.character), ~ifelse(is.na(.), "", .))) |> mutate(id_match = row_number()))
# Create a safe function to handle record linkage by block
process_block <- function(block_data) {
# Skip if too few records
if(nrow(block_data) <= 1) return(tidytable())
# Create copies for comparison
data1 <- data2 <- block_data
# Identify columns for comparison
varnames <- c("hash_key", "adm_date_rec_num", "disch_date_num_rec5", "plan_type", "id_centro", "ed_attainment", "sexo", "TABLE_rec")
# Convert columns to character and handle NAs
for(col in varnames) {
if(col %in% names(data1)) {
data1 <- data1|>
mutate({{ col }} := as.character(get(col)))|>
mutate({{ col }} := ifelse(is.na(get(col)), "", get(col)))
data2 <- data2|>
mutate({{ col }} := as.character(get(col)))|>
mutate({{ col }} := ifelse(is.na(get(col)), "", get(col)))
} else {
data1 <- data1|> mutate({{ col }} := "")
data2 <- data2|> mutate({{ col }} := "")
}
}
# Use tryCatch to handle errors
result <- tryCatch({
# Run fastLink
fl_out <- fastLink(
dfA = data1,
dfB = data2,
varnames = setdiff(varnames, c("adm_date_rec_num", "disch_date_num_rec5")),
stringdist.match = setdiff(varnames, c("adm_date_rec_num", "disch_date_num_rec5")),
threshold.match = 0.9,
n.cores = cores_min_1
)
# Extract matches
if(length(fl_out$matches$inds.a) > 0) {
tidytable(
id_match_1 = data1$id_match[fl_out$matches$inds.a],
id_match_2 = data2$id_match[fl_out$matches$inds.b],
disch_date_num_rec5_1 = data1$disch_date_num_rec5[fl_out$matches$inds.a],
disch_date_num_rec5_2 = data2$disch_date_num_rec5[fl_out$matches$inds.b],
adm_date_rec_num_1 = data1$adm_date_rec_num[fl_out$matches$inds.a],
adm_date_rec_num_2 = data2$adm_date_rec_num[fl_out$matches$inds.b],
match_score = fl_out$posterior
)|>
filter(id_match_1 < id_match_2)|> # Remove self-matches
mutate(disch_date_num_rec5_1= ifelse(disch_date_num_rec5_1=="", 19475, as.numeric(disch_date_num_rec5_1)))|>
mutate(disch_date_num_rec5_2= ifelse(disch_date_num_rec5_2=="", 19475, as.numeric(disch_date_num_rec5_2)))|>
mutate(adm_date_rec_num_1= as.numeric(adm_date_rec_num_1))|>
mutate(adm_date_rec_num_2= as.numeric(adm_date_rec_num_2))|>
mutate(overlap= case_when(
adm_date_rec_num_1 < disch_date_num_rec5_2 & # x Admitted before being admitted into another treatment
disch_date_num_rec5_1 > adm_date_rec_num_2~ 1,T~0))# x Discharged after being discharged from another treatment
} else {
tidytable()
}
}, error = function(e) {
cat("Error in block:", unique(block_data$yr_block), "\n")
print(e)
tidytable()
})
return(result)
}
# Process by blocks
all_matches <- tidytable()
blocks <- unique(data$yr_block)
for(b in blocks) {
cat("Processing block:", b, "\n")
block_data <- data %>% filter(yr_block == b)
block_matches <- process_block(block_data)
if(nrow(block_matches) > 0) {
all_matches <- bind_rows(all_matches, block_matches)
}
}
# Join with original data
final_results <- data|>
left_join(all_matches, by = c("id_match" = "id_match_1"))|>
mutate(match_score= sprintf("%1.2f", match_score))
#rio::export(final_results,"E:/Mi unidad/Alvacast/SISTRAT 2023/data/20241015_out/final_results.rds")Code
final_results_overlap <-
final_results|> filter(overlap==1)|> mutate(comb_hash_adm_date= paste0(hash_key,"_",adm_date_rec))|>
select(any_of(c("comb_hash_adm_date", "adm_date_rec_num", "disch_date_num_rec5", "plan_type", "id_centro", "ed_attainment", "sexo", "TABLE_rec", "id_match", "id_match_2", "disch_date_num_rec5_1", "disch_date_num_rec5_2", "adm_date_rec_num_1", "adm_date_rec_num_2", "match_score")))
cat(paste0("Overlaps w/ >.90 match score: ", formatC(nrow(final_results_overlap), big.mark=",")),"\n")This approach has proven not useful, so we went back to the straight detection of overlappings.
0.c. Resolution of most problematic cases and multiple overlaps
We apply the rules to detect for overlaps again. We used the SISTRAT23_c1_2010_2022_df_prev1l dataset, which is the one that has been cleaned of duplicates and has the new variables.
Code
CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id<-
SISTRAT23_c1_2010_2022_df_prev1l|>
mutate(disch_date_num_miss= ifelse(is.na(disch_date_num_rec5), 19475, disch_date_num_rec5))|> #equivalente a 2023-04-28 as.numeric(as.Date("2023-01-01"))
rename("hash_key_2"="hash_key", "rn2"="rn")|>
select(rn2, hash_key_2, TABLE, adm_age_rec, adm_date_rec, adm_date_rec_num , disch_date_rec0, disch_date_num_miss, dit_rec5, id_centro, tr_compliance_rec, plan_type, senda)|>
#dplyr::filter(motivodeegreso!="Derivación")|>
data.table::as.data.table()
overlap_dates_C1_after_miss_less30d_0d_center_id <- janitor::clean_names(
sqldf::sqldf(
"
SELECT *
FROM CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id AS x
INNER JOIN CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id AS y
ON x.hash_key_2 = y.hash_key_2
AND x.rn2 < y.rn2 -- Avoids duplicates (eg.: x vs y and then y vs x)
AND x.adm_date_rec_num < y.disch_date_num_miss -- x Admitted before being admitted into another treatment
AND x.disch_date_num_miss > y.adm_date_rec_num -- x Discharged after being admitted in other
"
))|>
`colnames<-`(c("rn_1", "hash_key_1", "ano_bd_1", "adm_age_1", "adm_date_1", "adm_date_rec_num_1", "disch_date_1", "disch_date_num_1", "dit_1", "id_centro_1", "tr_compliance_1", "plan_type_1", "senda_1", "rn_2", "hash_key_2", "ano_bd_2", "adm_age_2", "adm_date_2", "adm_date_rec_num_2", "disch_date_2", "disch_date_num_2", "dit_2", "id_centro_2", "tr_compliance_2", "plan_type_2", "senda_2"))
cat(paste0("Number of overlapped dates, observations: ", nrow(overlap_dates_C1_after_miss_less30d_0d_center_id)),"\n")
cat(paste0("Number of overlapped dates, RUNs: ", nrow(distinct(overlap_dates_C1_after_miss_less30d_0d_center_id, hash_key_1))))
#Number of overlapped dates, observations: 306 # 266 # 263 june 2025
#Number of overlapped dates, RUNs: 170 # 156 # 154 june 2025
#The rows on the left originate from older databases.
CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id <-
as_tidytable(overlap_dates_C1_after_miss_less30d_0d_center_id)|>
mutate(pair_id= paste0(rn_1,"_",rn_2))|>
mutate(same_id=ifelse(id_centro_1==id_centro_2,1,0))|>
mutate(bd_2_earlier=ifelse(ano_bd_2>ano_bd_1,1,0))|> #es el dato de la derecha de una base de datos mas reciente.
mutate(senda_status= case_when(senda_1=="si" & senda_2=="si"~ "both yes", senda_1=="no" & senda_2=="no"~ "both no", senda_1=="no" & senda_2=="si"~ "second yes", senda_1=="no" & senda_2=="no"~ "second no", T~NA_character_))|>
mutate(referral= ifelse(tr_compliance_1=="referral",1,0))|>
mutate(days_overlapped=disch_date_num_1-adm_date_rec_num_2)|> # para que hayan dias positivos. Se supone que la fecha de egreso es más reciente que la fecha de ingreso del evento que superpone.
mutate(more_dit=ifelse(dit_2>dit_1,1,0))|> #más días tratado en 2
mutate(trat_1_within_2=ifelse(disch_date_num_1<disch_date_num_2 & adm_date_rec_num_1>adm_date_rec_num_2,1,0))|>
mutate(trat_2_within_1=ifelse(disch_date_num_2<disch_date_num_1 & adm_date_rec_num_2>adm_date_rec_num_1,1,0))|>
select(-hash_key_2) |>
rename("hash_key"="hash_key_1")
warning("2025-04-09: The conditions now should be that the row number is present in the Excel file and also in the rows vector where more than one overlap was detected. Otherwise, outdated cases will be corrected, which, due to the correction of the truncation date in the 2019 database, are no longer valid as overlaps.")Warning: 2025-04-09: The conditions now should be that the row number is present in the Excel file and also in the rows vector where more than one overlap was detected. Otherwise, outdated cases will be corrected, which, due to the correction of the truncation date in the 2019 database, are no longer valid as overlaps.
Number of overlapped dates, observations: 263
Number of overlapped dates, RUNs: 154
Code
cat("Explore whether there are more than one overlapping treatment episodes within the same center ID, and if so, how many times it occurs, after replacing center ID and previous steps in overlappings.\n")
overlaps_after_miss_appear_more_than_one_time_post_center_id<-
CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id|>
tidytable::pivot_longer(
cols = matches("_[12]$"), # All columns ending with _1 or _2
names_to = c(".value", "wave"),
names_pattern = "(.+)_([12])",
values_drop_na = FALSE) |>
group_by(rn) |>
count() |>
filter(n>1) |> pull(rn)
cat("Have they changed?")
!identical(overlaps_after_miss_appear_more_than_one_time, overlaps_after_miss_appear_more_than_one_time_post_center_id)
cat(paste0("Number of overlaps after replacing center ID, episodes: ", formatC(length(overlaps_after_miss_appear_more_than_one_time_post_center_id), big.mark=",")),"\n")
cat(paste0("Number of overlapping combinations after replacing center IDs: ", formatC(nrow(CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id|> filter((rn_1 %in% overlaps_after_miss_appear_more_than_one_time | rn_2 %in% overlaps_after_miss_appear_more_than_one_time))), big.mark=",")),"\n")
#Number of overlaps after replacing center ID, episodes: 106 # 105 june 2025
#Number of overlapping combinations after replacing center IDs: 176 # 174 june 2025
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id|>
# filter((rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
# rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
# filter((rn_1 %in% overlaps_after_miss_appear_more_than_one_time_post_center_id |
# rn_2 %in% overlaps_after_miss_appear_more_than_one_time_post_center_id))
CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id|>
filter((rn_1 %in% overlaps_after_miss_appear_more_than_one_time |
rn_2 %in% overlaps_after_miss_appear_more_than_one_time))|>
(\(df) {
cat(paste0("More than one overlapping, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("More than one overlapping, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
#export HASHes to study them
distinct(df, hash_key)|> pull(hash_key) ->> hash_multiple_overlaps_after_center_id
})()
# More than one overlapping, cases: 176 # 174 june 2025
# More than one overlapping, RUNs: 68 #67 june 2025
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
more_one_overlap_after_center_id<-
SISTRAT23_c1_2010_2022_df_prev1l|>
filter(hash_key %in% as.character(hash_multiple_overlaps_after_center_id))|>
mutate(disch_date_num_miss= ifelse(is.na(disch_date_num_rec5), 19475, disch_date_num_rec5))|> #equivalent to 2023-04-28 as.numeric(as.Date("2023-01-01"))
select(hash_key, rn, TABLE_rec, adm_age_rec, senda_adm_date, adm_date_rec, adm_date_rec_num , disch_date_rec5, disch_date_num_miss, dit_rec5, id_centro, tr_compliance, plan_type, senda)|>
mutate(hash_key=factor(hash_key))
invisible("To check problematic cases\n")
if(length(ls()[grepl("no_mostrar", ls())])>0){
SISTRAT23_c1_2010_2022_df_prev1l|>
filter(hash_key=="0d3452833c9825ed178e4aea8da2bd30f86b1e5e1839fdc57e7e446105bcedde")|>
mutate(disch_date_num_miss= ifelse(is.na(disch_date_num_rec5), 19475, disch_date_num_rec5))|> #equivalente a 2023-04-28 as.numeric(as.Date("2023-01-01"))
select(rn, hash_key, TABLE_rec, adm_age_rec, senda_adm_date, adm_date_rec, adm_date_rec_num , disch_date, disch_date_num_miss, dit_rec5, id_centro, tr_compliance, plan_type, senda, OBS) |>
glimpse()
}
("To explore what are the rows that enter in conflict, to help us in the analysis of overlapings\n")
if(length(ls()[grepl("no_mostrar", ls())])>0){
CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id|>
filter(rn_1==76076|rn_2==76076) |> select(rn_1, rn_2, adm_date_1, disch_date_1, adm_date_2, disch_date_2)
opc <- c(10716,
2678,
5505)
SISTRAT23_c1_2010_2022_df_prev1l|>
filter(rn %in% opc)|>
select(OBS)
CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id|>
filter((rn_1 %in% opc |
rn_2 %in% opc))|>
select(rn_1, rn_2, adm_date_1, disch_date_1, adm_date_2, disch_date_2, hash_key)
#3e97be604b540841225cf7948ed4e822c969cba6c5b6484c916e0bb109cd38e4
}
cat("We export the dataset with more than one overlap to check it manually.\n")
result_more_one_overlap_after_center_id <- aggregate(rn ~ hash_key, data = more_one_overlap_after_center_id|>
mutate(hash_key=as.numeric(hash_key)), FUN = function(x) paste(x, collapse = ","))
more_one_overlap_after_center_id|>
mutate(hash_key=as.numeric(hash_key))|>
rio::export(paste0(wdpath, "cons/_out/more_one_overlaps_after_center_id.xlsx"))Explore whether there are more than one overlapping treatment episodes within the same center ID, and if so, how many times it occurs, after replacing center ID and previous steps in overlappings.
Have they changed?[1] FALSE
Number of overlaps after replacing center ID, episodes: 105
Number of overlapping combinations after replacing center IDs: 174
More than one overlapping, cases: 174
More than one overlapping, RUNs: 67
[1] "To explore what are the rows that enter in conflict, to help us in the analysis of overlapings\n"
We export the dataset with more than one overlap to check it manually.
Summary of Manual Data Cleaning for Overlapping Treatments
Manual adjustments were made to resolve overlapping treatment episodes based on the following criteria:
- Prioritized SENDA Admission Date (
adm_date_senda): Usedadm_date_sendaoveradm_dateto resolve overlaps, especially for treatments >1094 days or from pre-2012 databases. - Handled Multiple Ongoing Treatments: Retained the most recent ongoing treatment; adjusted the previous discharge date to one day before the next admission.
- Prioritized Recent Data: When overlaps occurred between records from different database years, the record from the most recent year was kept.
- Managed Missing Discharge Dates: Replaced missing discharge dates with the subsequent admission date minus one day, if applicable.
- Removed Unreliable Long Treatments: Eliminated treatments >1094 days if they lacked SENDA funding or originated from pre-2012 databases without a discharge date.
- Addressed Short Overlaps (<15 days): Considered these minor discrepancies, likely due to administrative delays, and resolved by retaining the most plausible record.
- Handled Referrals: Prioritized referrals from more recent databases in case of overlaps. If one treatment absorbed others (especially if SENDA-funded and recent), only the absorbing record was kept.
- Noted Truncated 2019 Data: Acknowledged that treatments recorded in the 2019 database might be truncated as of Nov 13, 2019, using
dias_en_tratamientofor duration calculations. - Flagged Ongoing Status: Marked treatments listed as “ongoing” for future status updates.
Changes were in the following variables: adm_date_corrected, disch_date_rec5_corrected and tr_compliance_rec.
Code
multiple_overlaps_manual_correction<-
rio::import(paste0(wdpath, "cons/_out/more_one_overlaps_after_center_id_mod.xlsx"), sheet = "Hoja 1")
cat("Aggregated by rows so we can pair the previous and the updated manual correciton by rownumbers")
result_multiple_overlaps_manual_correction <- aggregate(rn ~ hash_key, data = multiple_overlaps_manual_correction, FUN = function(x) paste(x, collapse = ","))
cat("How many records were in the Excel file vs. this new one?\n")
if(more_one_overlap_after_center_id|>
mutate(hash_key=as.numeric(hash_key))|>
filter(!rn %in% multiple_overlaps_manual_correction$rn) |> nrow()>0){
stop("There are records in the new Excel file that are not in the old one")
} else {print(0)}
#comparison by row numbers combined contrasted
if(
result_more_one_overlap_after_center_id|>
left_join(result_multiple_overlaps_manual_correction, by="rn")|>
(\(df) {
colnames(df)<- c("actual_hash_key", "rownumbers_combined", "previous_review_hashkey")
df
})() |>
filter(is.na(actual_hash_key))|> nrow()>0){
stop("There are records in the new Excel file that are not in the old one")
}else {print(0)}
cat("I need to find hash_keys that share the same rownumbers")
cat("This means that i only need to update the combiniton of rownumbers that are needed only")
result_more_one_overlap_after_center_id_updated <-
result_more_one_overlap_after_center_id|>
left_join(result_multiple_overlaps_manual_correction, by="rn")|>
(\(df) {
colnames(df)<- c("actual_hash_key", "rownumbers_combined", "previous_review_hashkey")
df
})()|>
left_join(multiple_overlaps_manual_correction, by= c("previous_review_hashkey"="hash_key"))|>
(\(df) {
if(nrow(df)!= more_one_overlap_after_center_id|> nrow()){
stop("Different rows between the updated and the matched with the manual correction")}
df
})()|>
#obtain real encripted RUNs
left_join((mutate(more_one_overlap_after_center_id, hash_key_num=as.numeric(hash_key))[, c("hash_key", "hash_key_num")]|> distinct(hash_key, .keep_all=T)), by= c("actual_hash_key"="hash_key_num"))|>
(\(df) {
#check if there are updates in the criteria and information to judge overlappings
if(group_by(df, previous_review_hashkey)|> filter(any(rn %in% hashs_dates_updated_disch_date$rny))|> ungroup()|> nrow()>0){ warning("Updated discharge dates of 2019 are still being discussed")}
if(group_by(df, previous_review_hashkey)|> filter(any(rn %in% rows_truncated_treatments_due_to_retrieval_2019))|> ungroup()|> nrow()>0){ warning("Missing discharge dates of pre-0 are still being discussed")}
df
})()Warning in (function(df) {: Missing discharge dates of pre-0 are still being discussed
Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
result_more_one_overlap_after_center_id_updated|>
filter(OBS == "eliminar")|>
(\(df) {
cat(paste0("4.0c.1.Delete tr. episodes, multiple overlappings, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4.0c.1.Delete tr. episodes, multiple overlappings, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df|>
pull(rn) ->> row_40c_delete_tr_episodes
df|>
left_join(more_one_overlap_after_center_id[,c("rn","hash_key")], by = "rn", multiple = "first")|>
pull(hash_key.y)|> as.character() ->> hashes_40c_delete_tr_episodes
})()
# 4.0c.1.Delete tr. episodes, multiple overlappings, cases: 35
# 4.0c.1.Delete tr. episodes, multiple overlappings, RUNs: 28
#june 2025
# 4.0c.1.Delete tr. episodes, multiple overlappings, cases: 31
# 4.0c.1.Delete tr. episodes, multiple overlappings, RUNs: 24
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
result_more_one_overlap_after_center_id_updated|>
filter(!is.na(disch_date_rec5_corrected))|>
(\(df) {
cat(paste0("4.0c.2.Replace discharge dates, multiple overlappings, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4.0c.2.Replace discharge dates, multiple overlappings, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df|>
select(rn, disch_date_rec5_corrected)|>
mutate(disch_date_rec5_corrected= as.Date(as.character(disch_date_rec5_corrected))) ->> row_40c_replace_disch_dates
})()
# 4.0c.2.Replace discharge dates, multiple overlappings, cases: 62
# 4.0c.2.Replace discharge dates, multiple overlappings, RUNs: 44
#june 2025
# 4.0c.2.Replace discharge dates, multiple overlappings, cases: 56
# 4.0c.2.Replace discharge dates, multiple overlappings, RUNs: 38
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
result_more_one_overlap_after_center_id_updated|>
filter(!is.na(adm_date_corrected))|>
(\(df) {
cat(paste0("4.0c.3.Replace admission date, multiple overlappings, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4.0c.3.Replace admission date, multiple overlappings, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df|>
select(rn, adm_date_corrected)|>
mutate(adm_date_corrected= as.Date(as.character(adm_date_corrected))) ->> row_40c_replace_adm_dates
})()
# 4.0c.3.Replace admission date, multiple overlappings, cases: 42
# 4.0c.3.Replace admission date, multiple overlappings, RUNs: 33
#june2025
# 4.0c.3.Replace admission date, multiple overlappings, cases: 42
# 4.0c.3.Replace admission date, multiple overlappings, RUNs: 33
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
result_more_one_overlap_after_center_id_updated|>
filter(!is.na(tr_compliance_rec))|>
(\(df) {
cat(paste0("4.0c.4.Replace referral cause, multiple overlappings, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4.0c.4.Replace referral cause, multiple overlappings, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df|>
select(rn, tr_compliance_rec)|>
mutate(tr_compliance_rec= as.character(tr_compliance_rec)) ->> row_40c_replace_referral
})()
# 4.0c.4.Replace referral cause, multiple overlappings, cases: 5
# 4.0c.4.Replace referral cause, multiple overlappings, RUNs: 5
# june2025
# 4.0c.4.Replace referral cause, multiple overlappings, cases: 2
# 4.0c.4.Replace referral cause, multiple overlappings, RUNs: 2
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("(________________________________________________)")
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
SISTRAT23_c1_2010_2022_df_prev1m<-
SISTRAT23_c1_2010_2022_df_prev1l|>
(\(df) {
cat(paste0("4.0c. Database before apply rules based on multiple overlappings, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4.0c. Database before apply rules based on multiple overlappings, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})()|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
filter(!(rn %in% row_40c_delete_tr_episodes))|>
mutate(OBS= case_when(hash_key %in% hashes_40c_delete_tr_episodes~ paste0(as.character(OBS),";","4.0c.1.Multiple overlappings, discarded tr. episodes"), T~ OBS))|>
#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
left_join(row_40c_replace_adm_dates, by="rn", suffix=c("","_40c3"))|>
left_join(row_40c_replace_disch_dates, by="rn", suffix=c("","_40c2"))|>
left_join(row_40c_replace_referral, by="rn", suffix=c("","_40c4"))|>
mutate(OBS= case_when(!is.na(adm_date_corrected)~ paste0(as.character(OBS),";","4.0c.3.Multiple overlappings, replace admission dates"), T~ OBS))|>
mutate(OBS= case_when(!is.na(disch_date_rec5_corrected)~ paste0(as.character(OBS),";","4.0c.2.Multiple overlappings, replace discharge dates"), T~ OBS))|>
mutate(OBS= case_when(!is.na(tr_compliance_rec_40c4)~ paste0(as.character(OBS),";","4.0c.4.Multiple overlappings, replace cause of discharge as referral"), T~ OBS))|>
#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
mutate(adm_date_rec2= case_when(!is.na(adm_date_corrected)~ adm_date_corrected, T~ adm_date_rec))|>
mutate(adm_date_num_rec2= case_when(!is.na(adm_date_corrected)~ as.numeric(adm_date_rec2), T~ as.numeric(adm_date_rec_num)))|>
mutate(disch_date_num_rec6= case_when(!is.na(disch_date_rec5_corrected)~ as.numeric(disch_date_rec5_corrected), T~ disch_date_num_rec5))|>
mutate(disch_date_rec6= case_when(!is.na(disch_date_rec5_corrected)~ disch_date_rec5_corrected, T~ disch_date_rec5))|>
mutate(tr_compliance_rec2= case_when(!is.na(tr_compliance_rec_40c4)~ tr_compliance_rec_40c4, T~ tr_compliance_rec))|>
mutate(adm_age_rec2=round(as.numeric((adm_date_rec2-birth_date_rec))/365.25,2))|>
#discard any columns related to the join to the main database
select(-(any_of(c("adm_date_corrected","disch_date_rec5_corrected"))))|>
select(-(contains("_40c4")))|>
#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
mutate(dit_rec6= disch_date_num_rec5- adm_date_num_rec2)|>
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Early vs. late dropout (2025-06-03, checked right)
mutate(dit_earl_drop_rec= ifelse(dit_rec6<90 & !is.na(dit_rec6),1,0))|>
mutate(dit_earl_drop= factor(dit_earl_drop_rec, labels=c(">= 90 days", "<90 days")))|> #t.test(dit_rec6~ dit_earl_drop_rec, data= SISTRAT23_c1_2010_2022_df_prev1m)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Treatment compliance
mutate(tr_compliance_rec3= case_when(grepl("<",dit_earl_drop) & grepl("drop", tr_compliance_rec2)~ "early dropout", grepl(">",dit_earl_drop) & grepl("drop", tr_compliance_rec2)~ "late dropout", grepl("<",dit_earl_drop) & grepl("adm dis", tr_compliance_rec2)~ "early adm discharge", grepl(">",dit_earl_drop) & grepl("adm dis", tr_compliance_rec2)~ "late adm discharge", grepl("completion",tr_compliance_rec2)~ "completion", grepl("death",tr_compliance_rec2)~ "death", grepl("referral",tr_compliance_rec2)~ "referral", grepl("adm tr",tr_compliance_rec2)~ "adm truncated", is.na(tr_compliance_rec2)~ "adm truncated", TRUE~ "currently in"))|>
#table(SISTRAT23_c1_2010_2022_df_prev1m$dit_earl_drop_rec,SISTRAT23_c1_2010_2022_df_prev1m$dit_earl_drop))
#filter(is.na(tr_compliance_rec2)) |> glimpse()
#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
(\(df) {
cat(paste0("4.0c. Database after apply rules based on multiple overlappings, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4.0c. Database after apply rules based on multiple overlappings, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1l))stop("Error: Added treatment episodes in the process")
df
})()
# 4.0c. Database before apply rules based on multiple overlappings, cases: 150,076
# 4.0c. Database before apply rules based on multiple overlappings, RUNs: 106,283
# 4.0c. Database after apply rules based on multiple overlappings, cases: 150,041
# 4.0c. Database after apply rules based on multiple overlappings, RUNs: 106,283 Aggregated by rows so we can pair the previous and the updated manual correciton by rownumbersHow many records were in the Excel file vs. this new one?
[1] 0
[1] 0
I need to find hash_keys that share the same rownumbersThis means that i only need to update the combiniton of rownumbers that are needed only4.0c.1.Delete tr. episodes, multiple overlappings, cases: 31
4.0c.1.Delete tr. episodes, multiple overlappings, RUNs: 24
4.0c.2.Replace discharge dates, multiple overlappings, cases: 56
4.0c.2.Replace discharge dates, multiple overlappings, RUNs: 38
4.0c.3.Replace admission date, multiple overlappings, cases: 42
4.0c.3.Replace admission date, multiple overlappings, RUNs: 33
4.0c.4.Replace referral cause, multiple overlappings, cases: 2
4.0c.4.Replace referral cause, multiple overlappings, RUNs: 2
4.0c. Database before apply rules based on multiple overlappings, cases: 150,077
4.0c. Database before apply rules based on multiple overlappings, RUNs: 106,283
4.0c. Database after apply rules based on multiple overlappings, cases: 150,046
4.0c. Database after apply rules based on multiple overlappings, RUNs: 106,283
The database SISTRAT23_c1_2010_2022_df_prev1m was generated by replacing the original admission and discharge dates, along with the causes of discharge. Subsequently, the following variables were added to this dataset: the revised discharge date (disch_date_rec6), its numeric representation (disch_date_num_rec6), and the calculated days in treatment (dit_rec6). The revised admission date resulting from the replacement was also included in the final dataset in its numeric (adm_date_num_rec2) and date (adm_date_rec2) format. Also, we generated tr_compliance_rec3 to recode cause of discharge according to changes made in days in treatment and overlapping correction. We also added dit_earl_drop_rec, a binary classification of treatments with less than 90 days.
We check again if there are overlaps after manual replacements.
Code
CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps<-
SISTRAT23_c1_2010_2022_df_prev1m|>
mutate(disch_date_num_miss= ifelse(is.na(disch_date_num_rec6), 19475, disch_date_num_rec6))|> #equivalente a 2023-04-28 as.numeric(as.Date("2023-01-01"))
rename("hash_key_2"="hash_key", "rn2"="rn")|>
select(rn2, hash_key_2, TABLE, adm_age_rec2, adm_date_rec2, adm_date_num_rec2 , disch_date_rec6, disch_date_num_miss, dit_rec6, id_centro, tr_compliance_rec3, plan_type, senda)|>
#dplyr::filter(motivodeegreso!="Derivación")|>
data.table::as.data.table()
overlap_dates_C1_after_miss_less30d_0d_center_id_mult_overlap <- janitor::clean_names(
sqldf::sqldf(
"
SELECT *
FROM CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps AS x
INNER JOIN CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps AS y
ON x.hash_key_2 = y.hash_key_2
AND x.rn2 < y.rn2 -- Avoids duplicates (eg.: x vs y and then y vs x)
AND x.adm_date_num_rec2 < y.disch_date_num_miss -- x Admitted before being admitted into another treatment
AND x.disch_date_num_miss > y.adm_date_num_rec2 -- x Discharged after being admitted in other
"
))|>
`colnames<-`(c("rn_1", "hash_key_1", "ano_bd_1", "adm_age_1", "adm_date_1", "adm_date_rec_num_1", "disch_date_1", "disch_date_num_1", "dit_1", "id_centro_1", "tr_compliance_1", "plan_type_1", "senda_1", "rn_2", "hash_key_2", "ano_bd_2", "adm_age_2", "adm_date_2", "adm_date_rec_num_2", "disch_date_2", "disch_date_num_2", "dit_2", "id_centro_2", "tr_compliance_2", "plan_type_2", "senda_2"))
cat(paste0("Number of overlapped dates, observations: ", nrow(overlap_dates_C1_after_miss_less30d_0d_center_id_mult_overlap)),"\n")
cat(paste0("Number of overlapped dates, RUNs: ", nrow(distinct(overlap_dates_C1_after_miss_less30d_0d_center_id_mult_overlap, hash_key_1))))
#Number of overlapped dates, observations: 90
#Number of overlapped dates, RUNs: 89
# june 2025
#Number of overlapped dates, observations: 89
#Number of overlapped dates, RUNs: 88
#The rows on the left originate from older databases.
CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps <-
as_tidytable(overlap_dates_C1_after_miss_less30d_0d_center_id_mult_overlap)|>
mutate(pair_id= paste0(rn_1,"_",rn_2))|>
mutate(same_id=ifelse(id_centro_1==id_centro_2,1,0))|>
mutate(bd_2_earlier=ifelse(ano_bd_2>ano_bd_1,1,0))|> #es el dato de la derecha de una base de datos mas reciente.
mutate(senda_status= case_when(senda_1=="si" & senda_2=="si"~ "both yes", senda_1=="no" & senda_2=="no"~ "both no", senda_1=="no" & senda_2=="si"~ "second yes", senda_1=="no" & senda_2=="no"~ "second no", T~NA_character_))|>
mutate(referral= ifelse(tr_compliance_1=="referral",1,0))|>
mutate(days_overlapped=disch_date_num_1-adm_date_rec_num_2)|> # para que hayan dias positivos. Se supone que la fecha de egreso es más reciente que la fecha de ingreso del evento que superpone.
mutate(more_dit=ifelse(dit_2>dit_1,1,0))|> #más días tratado en 2
mutate(trat_1_within_2=ifelse(disch_date_num_1<disch_date_num_2 & adm_date_rec_num_1>adm_date_rec_num_2,1,0))|>
mutate(trat_2_within_1=ifelse(disch_date_num_2<disch_date_num_1 & adm_date_rec_num_2>adm_date_rec_num_1,1,0))|>
select(-hash_key_2) |>
rename("hash_key"="hash_key_1")
warning("2025-04-09: The conditions now should be that the row number is present in the Excel file and also in the rows vector where more than one overlap was detected. Otherwise, outdated cases will be corrected, which, due to the correction of the truncation date in the 2019 database, are no longer valid as overlaps.")Warning: 2025-04-09: The conditions now should be that the row number is present in the Excel file and also in the rows vector where more than one overlap was detected. Otherwise, outdated cases will be corrected, which, due to the correction of the truncation date in the 2019 database, are no longer valid as overlaps.
Code
warning("2025-06-02: This was corrected partially, as 2019 updated dates were used.")Warning: 2025-06-02: This was corrected partially, as 2019 updated dates were used.
Number of overlapped dates, observations: 89
Number of overlapped dates, RUNs: 88
Code
CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps|>
left_join(SISTRAT23_c1_2010_2022_df_prev1m[, c("rn", "OBS")], by=c("rn_1"="rn"))|>
left_join(SISTRAT23_c1_2010_2022_df_prev1m[, c("rn", "OBS")], by=c("rn_2"="rn"), suffix=c("","_2nd"))|>
(\(df) {
mutate(df, hash_key= as.numeric(factor(hash_key)))|> rio::export("_out/_overlaps_dup_after_manual_imp.xlsx") #for visual comparison in excel
knitr::kable(filter(df, hash_key %in% pull(sample_n_with_seed(CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps,20, seed=2125),"hash_key"))|> mutate(hash_key= as.numeric(factor(hash_key))), format = "html", format.args = list(decimal.mark = ".", big.mark = ","), caption="Cases with overlapped treatment ranges (after correcting for missing discharge dates)", align = rep('c', 32))|>
kableExtra::kable_classic()|>
kableExtra::scroll_box(height = "400px")
})()| rn_1 | hash_key | ano_bd_1 | adm_age_1 | adm_date_1 | adm_date_rec_num_1 | disch_date_1 | disch_date_num_1 | dit_1 | id_centro_1 | tr_compliance_1 | plan_type_1 | senda_1 | rn_2 | ano_bd_2 | adm_age_2 | adm_date_2 | adm_date_rec_num_2 | disch_date_2 | disch_date_num_2 | dit_2 | id_centro_2 | tr_compliance_2 | plan_type_2 | senda_2 | pair_id | same_id | bd_2_earlier | senda_status | referral | days_overlapped | more_dit | trat_1_within_2 | trat_2_within_1 | OBS | OBS_2nd |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 19,764 | 1 | 2011 | 26.30 | 2011-11-14 | 15,292 | 2011-12-23 | 15,331 | 39 | 182 | early dropout | pg-pai | si | 37,150 | 2013 | 25.23 | 2010-10-19 | 14,901 | 2013-09-13 | 15,961 | 1,060 | 408 | referral | pg-pab | si | 19764_37150 | 0 | 1 | both yes | 0 | 430 | 1 | 1 | 0 | 2.1.1.b.The most common date is selected as the birth date;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards | 2.1.1.b.The most common date is selected as the birth date;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards |
| 47,306 | 2 | 2013 | 48.45 | 2013-12-17 | 16,056 | 2014-01-29 | 16,099 | 43 | 328 | referral | pg-pab | si | 53,893 | 2014 | 47.53 | 2013-01-17 | 15,722 | 2014-08-27 | 16,309 | 587 | 502 | completion | pg-pai | si | 47306_53893 | 0 | 1 | both yes | 1 | 377 | 1 | 1 | 0 | ;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards | ;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards |
| 45,670 | 3 | 2013 | 31.29 | 2013-10-21 | 15,999 | 2014-01-29 | 16,099 | 100 | 123 | referral | pg-pai | si | 53,680 | 2014 | 31.16 | 2013-09-02 | 15,950 | 2014-05-06 | 16,196 | 246 | 502 | late dropout | pg-pai | si | 45670_53680 | 0 | 1 | both yes | 1 | 149 | 1 | 1 | 0 | ;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards | ;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards |
| 35,463 | 4 | 2013 | 29.22 | 2013-01-14 | 15,719 | 2013-02-04 | 15,740 | 21 | 258 | early dropout | pg-pr | si | 36,422 | 2013 | 27.65 | 2011-06-21 | 15,146 | 2013-05-31 | 15,856 | 710 | 238 | referral | pg-pab | no | 35463_36422 | 0 | 0 | 0 | 594 | 1 | 1 | 0 | |||
| 204,449 | 5 | 2021 | 34.63 | 2021-11-30 | 18,961 | 2022-02-01 | 19,024 | 63 | 238 | early dropout | pg-pai | no | 210,941 | 2022 | 34.64 | 2021-12-07 | 18,968 | 2022-02-01 | 19,024 | 56 | 258 | early dropout | pg-pr | si | 204449_210941 | 0 | 1 | second yes | 0 | 56 | 0 | 0 | 0 | 1.1. Duplicated Cases in Almost Every Variable | |
| 5,572 | 6 | 2010 | 22.10 | 2010-01-25 | 14,634 | 2011-06-02 | 15,127 | 493 | 109 | referral | pg-pab | si | 6,177 | 2010 | 22.45 | 2010-06-01 | 14,761 | 2010-12-15 | 14,958 | 197 | 117 | late dropout | pg-pr | no | 5572_6177 | 0 | 0 | 1 | 366 | 0 | 0 | 1 | 1c.b.3.cases w/different discharge dates, removed entries w/ lower dit; 1c.b.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs; ;2.1.1.c.Multiple common dates found. Select the birth date closest to available external records | 2.1.1.c.Multiple common dates found. Select the birth date closest to available external records | |
| 5,014 | 7 | 2010 | 29.65 | 2009-04-27 | 14,361 | 2011-06-06 | 15,131 | 770 | 109 | referral | pg-pab | si | 10,410 | 2011 | 30.61 | 2010-04-12 | 14,711 | 2012-01-31 | 15,370 | 659 | 117 | completion | pg-pr | no | 5014_10410 | 0 | 1 | 1 | 420 | 0 | 0 | 0 | 1.1. Duplicated Cases in Almost Every Variable | ||
| 336 | 8 | 2010 | 21.88 | 2007-07-17 | 13,711 | 2010-02-01 | 14,641 | 945 | 118 | referral | pg-pai | si | 12,792 | 2011 | 24.35 | 2010-01-04 | 14,613 | 2012-02-01 | 15,371 | 758 | 118 | late adm discharge | pg-pai | si | 336_12792 | 1 | 1 | both yes | 1 | 28 | 0 | 0 | 0 | ;4.0c.2.Multiple overlappings, replace discharge dates | |
| 73,917 | 9 | 2015 | 20.94 | 2015-03-01 | 16,495 | 2015-07-27 | 16,643 | 148 | 148 | late dropout | pg-pai | si | 80,992 | 2015 | 21.20 | 2015-06-06 | 16,592 | 2016-02-01 | 16,832 | 240 | 146 | late dropout | pg-pr | no | 73917_80992 | 0 | 0 | 0 | 51 | 1 | 0 | 0 | 3.1. Collapsed Treatment Plans | ||
| 72,706 | 10 | 2015 | 41.28 | 2015-02-05 | 16,471 | 2015-10-02 | 16,710 | 239 | 316 | referral | pg-pai | si | 78,199 | 2015 | 41.67 | 2015-06-25 | 16,611 | 2015-06-27 | 16,613 | 2 | 336 | completion | pg-pr | no | 72706_78199 | 0 | 0 | 1 | 99 | 0 | 0 | 1 | 3.1. Collapsed Treatment Plans | ||
| 75,938 | 11 | 2015 | 40.73 | 2015-04-30 | 16,555 | 2015-06-10 | 16,596 | 41 | 609 | referral | pg-pab | si | 76,789 | 2015 | 40.71 | 2015-04-21 | 16,546 | 2015-06-30 | 16,616 | 70 | 146 | referral | pg-pr | no | 75938_76789 | 0 | 0 | 1 | 50 | 1 | 1 | 0 | 3.1. Collapsed Treatment Plans | ||
| 103,198 | 12 | 2016 | 38.98 | 2016-10-11 | 17,085 | 2016-12-02 | 17,137 | 52 | 141 | referral | pg-pai | no | 109,664 | 2017 | 39.00 | 2016-10-20 | 17,094 | 2017-07-13 | 17,360 | 266 | 142 | completion | m-pr | si | 103198_109664 | 0 | 1 | second yes | 1 | 43 | 1 | 0 | 0 | 1.1. Duplicated Cases in Almost Every Variable | |
| 34,549 | 13 | 2013 | 20.94 | 2012-11-12 | 15,656 | 2013-02-04 | 15,740 | 84 | 142 | early dropout | m-pr | si | 36,225 | 2013 | 20.71 | 2012-08-18 | 15,570 | 2013-10-10 | 15,988 | 418 | 146 | late dropout | pg-pab | no | 34549_36225 | 0 | 0 | 0 | 170 | 1 | 1 | 0 | 1.1. Duplicated Cases in Almost Every Variable;2.1.1.b.The most common date is selected as the birth date | 2.1.1.b.The most common date is selected as the birth date | |
| 6,300 | 14 | 2010 | 30.27 | 2010-05-27 | 14,756 | 2011-06-06 | 15,131 | 375 | 109 | referral | pg-pai | si | 11,408 | 2011 | 30.55 | 2010-09-08 | 14,860 | 2011-08-31 | 15,217 | 357 | 117 | late dropout | pg-pr | no | 6300_11408 | 0 | 1 | 1 | 271 | 0 | 0 | 0 | 1.1. Duplicated Cases in Almost Every Variable | ||
| 18,837 | 15 | 2011 | 31.76 | 2011-09-01 | 15,218 | 2011-09-07 | 15,224 | 6 | 275 | early dropout | m-pr | no | 21,776 | 2012 | 31.61 | 2011-07-07 | 15,162 | 2012-12-26 | 15,700 | 538 | 291 | referral | pg-pai | si | 18837_21776 | 0 | 1 | second yes | 0 | 62 | 1 | 1 | 0 | 1.1. Duplicated Cases in Almost Every Variable | |
| 45,033 | 16 | 2013 | 56.21 | 2013-09-11 | 15,959 | 2014-01-29 | 16,099 | 140 | 123 | referral | pg-pai | si | 54,497 | 2014 | 56.17 | 2013-08-26 | 15,943 | 2014-08-20 | 16,302 | 359 | 502 | completion | pg-pai | si | 45033_54497 | 0 | 1 | both yes | 1 | 156 | 1 | 1 | 0 | 2.1.1.b.The most common date is selected as the birth date;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards | 2.1.1.b.The most common date is selected as the birth date;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards |
| 35,519 | 17 | 2013 | 27.99 | 2013-01-04 | 15,709 | 2013-05-28 | 15,853 | 144 | 154 | referral | pg-pr | si | 36,433 | 2013 | 27.99 | 2013-01-01 | 15,706 | 2013-08-27 | 15,944 | 238 | 330 | referral | pg-pr | no | 35519_36433 | 0 | 0 | 1 | 147 | 1 | 1 | 0 | 3.1. Collapsed Treatment Plans | ||
| 44,506 | 18 | 2013 | 22.26 | 2013-09-06 | 15,954 | 2013-11-29 | 16,038 | 84 | 123 | early adm discharge | pg-pab | si | 53,864 | 2014 | 22.18 | 2013-08-08 | 15,925 | 2014-05-06 | 16,196 | 271 | 502 | late dropout | pg-pab | si | 44506_53864 | 0 | 1 | both yes | 0 | 113 | 1 | 1 | 0 | ;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards | ;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards |
| 5,613 | 19 | 2010 | 23.87 | 2010-05-06 | 14,735 | 2010-08-26 | 14,847 | 112 | 275 | referral | m-pr | si | 23,153 | 2012 | 23.83 | 2010-04-19 | 14,718 | 2012-10-31 | 15,644 | 926 | 259 | late dropout | pg-pai | si | 5613_23153 | 0 | 1 | both yes | 1 | 129 | 1 | 1 | 0 | ;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset not earliest, less 1094 days in tr., check afterwards | 1.1. Duplicated Cases in Almost Every Variable;1c.a.3.cases w/different discharge dates, removed entries w/ lower dit; 1c.a.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs; ;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards;4.35a1.Different center ID, same SENDA financing status, one tr. in the middle of the other, yearly dataset earliest, less 1094 days in tr., check afterwards |
| 25,303 | 20 | 2012 | 18.79 | 2012-03-19 | 15,418 | 2019-12-31 | 18,261 | 2,843 | adm truncated | pg-pai | no | 29,037 | 2012 | 19.21 | 2012-08-20 | 15,572 | 2013-01-15 | 15,720 | 148 | referral | pg-pr | no | 25303_29037 | 0 | both no | 0 | 2,689 | 0 | 0 | 1 | 2.1.1.a.Less16|More90, removed rows due to >2 |diff| ;2.1.1.c.Multiple common dates found. Select the birth date closest to available external records; 4.pre. Missing discharge dates due administrative truncation in 2019, imputed | 2.1.1.a.Less16|More90, removed rows due to >2 |diff| ;2.1.1.c.Multiple common dates found. Select the birth date closest to available external records;3.1. Collapsed Treatment Plans;3.4. Invalid Age Of Onset of Substance use, <5 yrs old |
Provisionally, we generated the database SISTRAT23_c1_2010_2022_df_prev1n, which removes duplicates which were of more than 1095 days, within another treatment, if there are financed by SENDA and other not financed by SENDA, kept financed by SENDA, in order to continue reviewing the observations and normalizing data in other relevant aspects.
Code
SISTRAT23_c1_2010_2022_df_prev1n<-
SISTRAT23_c1_2010_2022_df_prev1m |>
#greater than 1095 days
dplyr::filter(!(rn %in% subset(CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps, dit_1>1095, "rn_1")))|>
dplyr::filter(!(rn %in% subset(CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps, dit_2>1095, "rn_2")))|>
#tr. within the other
dplyr::filter(!(rn %in% subset(CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps, trat_1_within_2==1, "rn_1")))|>
dplyr::filter(!(rn %in% subset(CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps, trat_2_within_1==1, "rn_2")))|>
#senda Yes
dplyr::filter(!(rn %in% subset(CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps, senda_1=="no" & senda_2=="si", "rn_1")))|>
dplyr::filter(!(rn %in% subset(CONS_C1_df_dup_intervals_after_miss_less30d_0d_center_id_multiple_overlaps, senda_1=="si" & senda_2=="no", "rn_2"))) |>
#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;#;
(\(df) {
cat(paste0("4.0xx. Database after eliminating remanent duplicates, cases: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("4.0xx. Database after eliminating remanent duplicates, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1m))stop("Error: Added treatment episodes in the process")
df
})()4.0xx. Database after eliminating remanent duplicates, cases: 150,046
4.0xx. Database after eliminating remanent duplicates, RUNs: 106,283
1. Data Editing / Deductive Imputation
1.1. DSM/ICD-10
Some cases did not have a primary diagnosis in DSM-IV notation but have a secondary (n= 604) or tertiary but no secondary (n= 20).
The data uses a nested structure for main and sub-diagnostic categories. When an episode had a main diagnosis but was missing a sub-diagnosis, we inserted ‘NA’ as a placeholder (“NA_placeholder_”). We then removed duplicate entries among the second or third pair of main and sub-diagnoses. After this cleaning step, the diagnoses for each episode were concatenated
The replacement of DSM-IV diagnoses with ICD-10 codes is not recommended for our analysis yet. We lack documentation on the source of any intersection between these classification systems, and 31 sub-diagnoses have no direct equivalents between the two systems. This inconsistency would compromise the validity of our diagnostic categorization and subsequent analyses..
The main diagnoses and sub-diagnoses for ICD-10 and DSM-IV classification systems were combined into the mod_psiq_cie_10_or and mod_psiq_dsm_iv_or columns, respectively. In the future (step 4), they should be separated by column.
Additionally, the columns with suffixes _instudy (detects any “in study”), _no_dg (detects any “no disorder”), and _dg (detects any valid diagnostic) enable the identification of records where categories such as “sin trastorno” (no disorder) and “en estudio” (under study) can be removed, as these designations provide no clinical value when they occur alongside established diagnoses _dg).
Code
names_dg_dsmiv<-
c("diagnostico_trs_psiquiatrico_dsm_iv", "diagnostico_trs_psiquiatrico_sub_dsm_iv",
"x2_diagnostico_trs_psiquiatrico_dsm_iv", "x2_diagnostico_trs_psiquiatrico_sub_dsm_iv",
"x3_diagnostico_trs_psiquiatrico_dsm_iv", "x3_diagnostico_trs_psiquiatrico_sub_dsm_iv")
names_dg_icd10<- c("diagnostico_trs_psiquiatrico_cie_10", "diagnostico_trs_psiquiatrico_sub_cie_10",
"x2_diagnostico_trs_psiquiatrico_cie_10", "x2_diagnostico_trs_psiquiatrico_sub_cie_10",
"x3_diagnostico_trs_psiquiatrico_cie_10", "x3_diagnostico_trs_psiquiatrico_sub_cie_10",
"diagnostico_trastorno_psiquiatrico_cie_10_al_egreso")
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#dg_trs_psiq_sub_cie_10_or x2_dg_trs_psiq_sub_cie_10_or x3_dg_trs_psiq_sub_cie_10_or
cat("Cases with sub-diagnostics but without the main: DSM-IV\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()
cat("second dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(x2_diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()
cat("third dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(x3_diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Cases with sub-diagnostics but without the main: ICD-10\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(diagnostico_trs_psiquiatrico_cie_10) & !is.na(diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10)))|> nrow()
cat("second dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(x2_diagnostico_trs_psiquiatrico_cie_10) & !is.na(x2_diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10)))|> nrow()
cat("third dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(x3_diagnostico_trs_psiquiatrico_cie_10) & !is.na(x3_diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10)))|> nrow()
# 3 cases with sub-diagnostics
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#dg_trs_psiq_sub_cie_10_or x2_dg_trs_psiq_sub_cie_10_or x3_dg_trs_psiq_sub_cie_10_or
cat("Cases with sub-diagnostics but without the main: DSM-IV\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()
cat("second dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(x2_diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()
cat("third dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter(is.na(x3_diagnostico_trs_psiquiatrico_dsm_iv) & !is.na(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv)))|> nrow()
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno): ICD-10\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter((is.na(diagnostico_trs_psiquiatrico_cie_10) |
diagnostico_trs_psiquiatrico_cie_10 == "en estudio" |
diagnostico_trs_psiquiatrico_cie_10 == "sin trastorno") & !is.na(diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10))) |>
nrow()
cat("second dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter((is.na(x2_diagnostico_trs_psiquiatrico_cie_10) |
x2_diagnostico_trs_psiquiatrico_cie_10 == "en estudio" |
x2_diagnostico_trs_psiquiatrico_cie_10 == "sin trastorno") & !is.na(x2_diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10))) |>
nrow()
cat("third dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter((is.na(x3_diagnostico_trs_psiquiatrico_cie_10) |
x3_diagnostico_trs_psiquiatrico_cie_10 == "en estudio" |
x3_diagnostico_trs_psiquiatrico_cie_10 == "sin trastorno") & !is.na(x3_diagnostico_trs_psiquiatrico_sub_cie_10))|> select(c("hash_key",any_of(names_dg_icd10))) |>
nrow()
cat("Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno): DSM-IV\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter((is.na(diagnostico_trs_psiquiatrico_dsm_iv) |
diagnostico_trs_psiquiatrico_dsm_iv == "en estudio" |
diagnostico_trs_psiquiatrico_dsm_iv == "sin trastorno") & !is.na(diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv))) |>
nrow()
cat("second dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter((is.na(x2_diagnostico_trs_psiquiatrico_dsm_iv) |
x2_diagnostico_trs_psiquiatrico_dsm_iv == "en estudio" |
x2_diagnostico_trs_psiquiatrico_dsm_iv == "sin trastorno") & !is.na(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv))) |>
nrow()
cat("third dg.\n")
SISTRAT23_c1_2010_2022_df_prev1n|> filter((is.na(x3_diagnostico_trs_psiquiatrico_dsm_iv) |
x3_diagnostico_trs_psiquiatrico_dsm_iv == "en estudio" |
x3_diagnostico_trs_psiquiatrico_dsm_iv == "sin trastorno") & !is.na(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv))|> select(c("hash_key",any_of(names_dg_dsmiv))) |>
nrow() #4
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("Primero solucionar el problema de arriba: clasificaciones con en estudio, pero la subclasificación con diagnóstico (agregarle como condición que la subclas tenga también no_NAs")
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("to standardize the main category with the DSM-IV subcategory\n")
dg_trs_psiq_dsm_iv_sub_tab<-
SISTRAT23_c1_2010_2022_df_prev1n|>
mutate(dsm_concat= paste0(diagnostico_trs_psiquiatrico_dsm_iv, "_", diagnostico_trs_psiquiatrico_sub_dsm_iv))|>
mutate(x2_dsm_concat= paste0(x2_diagnostico_trs_psiquiatrico_dsm_iv, "_", x2_diagnostico_trs_psiquiatrico_sub_dsm_iv))|>
mutate(x3_dsm_concat= paste0(x3_diagnostico_trs_psiquiatrico_dsm_iv, "_", x3_diagnostico_trs_psiquiatrico_sub_dsm_iv))|>
select(ends_with("dsm_concat")) |>
pivot_longer(
cols = everything(),
names_to = "concat_type",
values_to = "concat_value"
)|>
select(-concat_type)|>
janitor::tabyl(concat_value)|>
data.frame()|>
arrange(desc(n))|>
select(concat_value)|>
#only useful links
filter(!grepl("_NA$", concat_value))|>
filter(!grepl("^NA_", concat_value))|>
filter(!grepl("en estudio", concat_value))|>
filter(!grepl("sin trastorno", concat_value))|>
tidyr::separate(concat_value, into = c("main", "sub"), sep = "_")
if(dg_trs_psiq_dsm_iv_sub_tab |>
group_by(sub) |>
summarise(main=n_distinct(main)) |>
filter(main>1) |> nrow()<0){stop("more than one main category in one sub diagnostic")}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("to standardize the main category with the DSM-IV subcategory\n")
dg_trs_psiq_icd10_sub_tab<-
SISTRAT23_c1_2010_2022_df_prev1n|>
mutate(icd_concat= paste0(diagnostico_trs_psiquiatrico_cie_10, "_", diagnostico_trs_psiquiatrico_sub_cie_10))|>
mutate(x2_icd_concat= paste0(x2_diagnostico_trs_psiquiatrico_cie_10, "_", x2_diagnostico_trs_psiquiatrico_sub_cie_10))|>
mutate(x3_icd_concat= paste0(x3_diagnostico_trs_psiquiatrico_cie_10, "_", x3_diagnostico_trs_psiquiatrico_sub_cie_10))|>
select(ends_with("icd_concat")) |>
pivot_longer(
cols = everything(),
names_to = "concat_type",
values_to = "concat_value"
)|>
select(-concat_type)|>
janitor::tabyl(concat_value)|>
data.frame()|>
arrange(desc(n))|>
select(concat_value)|>
#only useful links
filter(!grepl("_NA$", concat_value))|>
filter(!grepl("^NA_", concat_value))|>
filter(!grepl("en estudio", concat_value))|>
filter(!grepl("sin trastorno", concat_value))|>
tidyr::separate(concat_value, into = c("main", "sub"), sep = "_")
if(dg_trs_psiq_icd10_sub_tab |>
group_by(sub) |>
summarise(main=n_distinct(main)) |>
filter(main>1) |> nrow()<0){stop("more than one main category in one sub diagnostic")}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("remove redundancies and duplicates in diagnoses\n")
#_______________#_____________________#______________#
#_______________#_____________________#______________#
#_______________#_____________________#______________#
invisible("Put placeholder to make replacements")
SISTRAT23_c1_2010_2022_df_prev1n_mod1<-
SISTRAT23_c1_2010_2022_df_prev1n|>
mutate(dg_trs_psiq_dsm_iv_or = case_when(
!is.na(diagnostico_trs_psiquiatrico_sub_dsm_iv) &
diagnostico_trs_psiquiatrico_sub_dsm_iv != "en estudio" &
diagnostico_trs_psiquiatrico_sub_dsm_iv != "sin trastorno" ~
# Perform the lookup here
dg_trs_psiq_dsm_iv_sub_tab$main[match(diagnostico_trs_psiquiatrico_sub_dsm_iv, dg_trs_psiq_dsm_iv_sub_tab$sub)],
TRUE ~ diagnostico_trs_psiquiatrico_dsm_iv # Keep the original value otherwise
#to explore differences and origin
))|> #filter(is.na(dg_trs_psiq_dsm_iv_or), !is.na(diagnostico_trs_psiquiatrico_dsm_iv)) |> select(c("hash_key","dg_trs_psiq_dsm_iv_or","dg_trs_psiq_sub_dsm_iv_or", any_of(names_dg_dsmiv)))|> View()
mutate(x2_dg_trs_psiq_dsm_iv_or = case_when(
!is.na(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv) &
x2_diagnostico_trs_psiquiatrico_sub_dsm_iv != "en estudio" &
x2_diagnostico_trs_psiquiatrico_sub_dsm_iv != "sin trastorno" &
x2_diagnostico_trs_psiquiatrico_sub_dsm_iv != "NA_placeholder" ~
dg_trs_psiq_dsm_iv_sub_tab$main[match(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv, dg_trs_psiq_dsm_iv_sub_tab$sub)],
TRUE ~ x2_diagnostico_trs_psiquiatrico_dsm_iv
))|> #filter(x2_dg_trs_psiq_dsm_iv_or!=x2_diagnostico_trs_psiquiatrico_dsm_iv) |> select(c("hash_key","x2_dg_trs_psiq_dsm_iv_or", any_of(names_dg_dsmiv)))|> glimpse()
mutate(x3_dg_trs_psiq_dsm_iv_or = case_when(
!is.na(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv) &
x3_diagnostico_trs_psiquiatrico_sub_dsm_iv != "en estudio" &
x3_diagnostico_trs_psiquiatrico_sub_dsm_iv != "sin trastorno" ~
dg_trs_psiq_dsm_iv_sub_tab$main[match(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv, dg_trs_psiq_dsm_iv_sub_tab$sub)],
TRUE ~ x3_diagnostico_trs_psiquiatrico_dsm_iv
))|> #filter(x3_dg_trs_psiq_dsm_iv_or!=x3_diagnostico_trs_psiquiatrico_dsm_iv) |> select(c("hash_key","x3_dg_trs_psiq_dsm_iv_or", any_of(names_dg_dsmiv)))|> glimpse()
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#now with ICD-10 classifications
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
mutate(dg_trs_psiq_cie_10_or = case_when(
!is.na(diagnostico_trs_psiquiatrico_sub_cie_10) &
diagnostico_trs_psiquiatrico_sub_cie_10 != "en estudio" &
diagnostico_trs_psiquiatrico_sub_cie_10 != "sin trastorno" ~
dg_trs_psiq_icd10_sub_tab$main[match(diagnostico_trs_psiquiatrico_sub_cie_10, dg_trs_psiq_icd10_sub_tab$sub)],
TRUE ~ diagnostico_trs_psiquiatrico_cie_10
))|> #filter(dg_trs_psiq_cie_10_or!=diagnostico_trs_psiquiatrico_cie_10) |> select(c("hash_key","dg_trs_psiq_cie_10_or", any_of(names_dg_icd10)))|> glimpse()
mutate(x2_dg_trs_psiq_cie_10_or = case_when(
!is.na(x2_diagnostico_trs_psiquiatrico_sub_cie_10) &
x2_diagnostico_trs_psiquiatrico_sub_cie_10 != "en estudio" &
x2_diagnostico_trs_psiquiatrico_sub_cie_10 != "sin trastorno" ~
dg_trs_psiq_icd10_sub_tab$main[match(x2_diagnostico_trs_psiquiatrico_sub_cie_10, dg_trs_psiq_icd10_sub_tab$sub)],
TRUE ~ x2_diagnostico_trs_psiquiatrico_cie_10
))|> #filter(x2_dg_trs_psiq_cie_10_or!=x2_diagnostico_trs_psiquiatrico_cie_10) |> select(c("hash_key","x2_dg_trs_psiq_cie_10_or", any_of(names_dg_icd10)))|> glimpse()
mutate(x3_dg_trs_psiq_cie_10_or = case_when(
!is.na(x3_diagnostico_trs_psiquiatrico_sub_cie_10) &
x3_diagnostico_trs_psiquiatrico_sub_cie_10 != "en estudio" &
x3_diagnostico_trs_psiquiatrico_sub_cie_10 != "sin trastorno" ~
dg_trs_psiq_icd10_sub_tab$main[match(x3_diagnostico_trs_psiquiatrico_sub_cie_10, dg_trs_psiq_icd10_sub_tab$sub)],
TRUE ~ x3_diagnostico_trs_psiquiatrico_cie_10
))|> #filter(x3_dg_trs_psiq_cie_10_or!=x3_diagnostico_trs_psiquiatrico_cie_10) |> select(c("hash_key","x3_dg_trs_psiq_cie_10_or", any_of(names_dg_icd10)))|> glimpse()
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#in case of main dg but no sub-dg, DSM-IV: this empty field will be respected in the future in case replacing diagnoses
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
mutate(dg_trs_psiq_sub_dsm_iv_or = case_when(
is.na(diagnostico_trs_psiquiatrico_sub_dsm_iv) &
!is.na(dg_trs_psiq_dsm_iv_or) &
diagnostico_trs_psiquiatrico_dsm_iv != "en estudio" &
diagnostico_trs_psiquiatrico_dsm_iv != "sin trastorno" ~ "NA_placeholder", # Or whatever value you want to assign
TRUE ~ diagnostico_trs_psiquiatrico_sub_dsm_iv # Keep the original value otherwise
))|> #filter(dg_trs_psiq_sub_dsm_iv_or=="NA_placeholder") |> select(c("hash_key","dg_trs_psiq_sub_dsm_iv_or", any_of(names_dg_dsmiv)))|> glimpse()
mutate(x2_dg_trs_psiq_sub_dsm_iv_or = case_when(
is.na(x2_diagnostico_trs_psiquiatrico_sub_dsm_iv) &
!is.na(x2_dg_trs_psiq_dsm_iv_or) &
x2_diagnostico_trs_psiquiatrico_dsm_iv != "en estudio" &
x2_diagnostico_trs_psiquiatrico_dsm_iv != "sin trastorno" ~ "NA_placeholder", # Or whatever value you want to assign
TRUE ~ x2_diagnostico_trs_psiquiatrico_sub_dsm_iv # Keep the original value otherwise
))|> #filter(x2_dg_trs_psiq_sub_dsm_iv_or=="NA_placeholder") |> select(c("hash_key","x2_dg_trs_psiq_sub_dsm_iv_or", any_of(names_dg_dsmiv)))|> glimpse()
mutate(x3_dg_trs_psiq_sub_dsm_iv_or = case_when(
is.na(x3_diagnostico_trs_psiquiatrico_sub_dsm_iv) &
!is.na(x3_dg_trs_psiq_dsm_iv_or) &
x3_diagnostico_trs_psiquiatrico_dsm_iv != "en estudio" &
x3_diagnostico_trs_psiquiatrico_dsm_iv != "sin trastorno" ~ "NA_placeholder", # Or whatever value you want to assign
TRUE ~ x3_diagnostico_trs_psiquiatrico_sub_dsm_iv # Keep the original value otherwise
))|> #filter(x3_dg_trs_psiq_sub_dsm_iv_or=="NA_placeholder") |> select(c("hash_key","x3_dg_trs_psiq_sub_dsm_iv_or", any_of(names_dg_dsmiv)))|> glimpse()
mutate(dg_trs_psiq_sub_cie_10_or = case_when(
is.na(diagnostico_trs_psiquiatrico_sub_cie_10) &
!is.na(dg_trs_psiq_cie_10_or) &
diagnostico_trs_psiquiatrico_cie_10 != "en estudio" &
diagnostico_trs_psiquiatrico_cie_10 != "sin trastorno" ~ "NA_placeholder", # Or whatever value you want to assign
TRUE ~ diagnostico_trs_psiquiatrico_sub_cie_10 # Keep the original value otherwise
))|>
mutate(x2_dg_trs_psiq_sub_cie_10_or = case_when(
is.na(x2_diagnostico_trs_psiquiatrico_sub_cie_10) &
!is.na(x2_dg_trs_psiq_cie_10_or) &
x2_diagnostico_trs_psiquiatrico_cie_10 != "en estudio" &
x2_diagnostico_trs_psiquiatrico_cie_10 != "sin trastorno" ~ "NA_placeholder", # Or whatever value you want to assign
TRUE ~ x2_diagnostico_trs_psiquiatrico_sub_cie_10 # Keep the original value otherwise
))|>
mutate(x3_dg_trs_psiq_sub_cie_10_or = case_when(
is.na(x3_diagnostico_trs_psiquiatrico_sub_cie_10) &
!is.na(x3_dg_trs_psiq_cie_10_or) &
x3_diagnostico_trs_psiquiatrico_cie_10 != "en estudio" &
x3_diagnostico_trs_psiquiatrico_cie_10 != "sin trastorno" ~ "NA_placeholder", # Or whatever value you want to assign
TRUE ~ x3_diagnostico_trs_psiquiatrico_sub_cie_10 # Keep the original value otherwise
))|> #filter(x3_dg_trs_psiq_sub_cie_10_or=="NA_placeholder") |> select(c("hash_key","x3_dg_trs_psiq_cie_10_or", any_of(names_dg_icd10)))|> glimpse()
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Collapse and separate main
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# First create combined diagnosis pairs for comparison
mutate(
# Create diagnosis pairs for comparison
diag_pair_1 = paste(dg_trs_psiq_cie_10_or,
dg_trs_psiq_sub_cie_10_or, sep = "::"),
diag_pair_2 = paste(x2_dg_trs_psiq_cie_10_or,
x2_dg_trs_psiq_sub_cie_10_or, sep = "::"),
diag_pair_3 = paste(x3_dg_trs_psiq_cie_10_or,
x3_dg_trs_psiq_sub_cie_10_or, sep = "::"),
# Now flag duplicates for removal
keep_pair_2 = diag_pair_2 != diag_pair_1,
keep_pair_3 = diag_pair_3 != diag_pair_1 & diag_pair_3 != diag_pair_2
)|>
# Apply the duplicate filtering
mutate(
# Set the filtered columns based on duplicate flags
# prevents the same diagnosis from being counted multiple times when we combine them into the final concatenated field
x2_diag_filtered = if_else(keep_pair_2, x2_dg_trs_psiq_cie_10_or, NA_character_),
x2_subdiag_filtered = if_else(keep_pair_2, x2_dg_trs_psiq_sub_cie_10_or, NA_character_),
x3_diag_filtered = if_else(keep_pair_3, x3_dg_trs_psiq_cie_10_or, NA_character_),
x3_subdiag_filtered = if_else(keep_pair_3, x3_dg_trs_psiq_sub_cie_10_or, NA_character_)
)
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
sub_dsm_iv_to_cie_10_comp_table <- rio::import(paste0(wdpath,"cons/_input/sub_dsm_iv_to_cie_10_comp_table.xlsx"))|>
# minusc, we changed tildes
mutate(across(where(is.character),
~stringi::stri_trans_general(., "Latin-ASCII")))
invisible("Is not very useful to replace DSM-IV for ICD-10 codes. We dont know the source of the homologation and 31 sub-diagnostics are not homologued")
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("merge clean diagnoses\n")
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
invisible({
SISTRAT23_c1_2010_2022_df_prev1n_mod1[, c("diag1", "diag2", "diag3") := .(NA_character_, NA_character_, NA_character_)]
SISTRAT23_c1_2010_2022_df_prev1n_mod1[, c("diag1", "diag2", "diag3") := .(
# First diagnosis
ifelse(!is.na(dg_trs_psiq_cie_10_or) &
!dg_trs_psiq_cie_10_or %in% c("en estudio", "sin trastorno"),
paste0(dg_trs_psiq_cie_10_or, "::",
ifelse(!is.na(dg_trs_psiq_sub_cie_10_or),
dg_trs_psiq_sub_cie_10_or, "NA")),
NA_character_),
# Second diagnosis
ifelse(!is.na(x2_diag_filtered) &
!x2_diag_filtered %in% c("en estudio", "sin trastorno"),
paste0(x2_diag_filtered, "::",
ifelse(!is.na(x2_subdiag_filtered),
x2_subdiag_filtered, "NA")),
NA_character_),
# Third diagnosis
ifelse(!is.na(x3_diag_filtered) &
!x3_diag_filtered %in% c("en estudio", "sin trastorno"),
paste0(x3_diag_filtered, "::",
ifelse(!is.na(x3_subdiag_filtered),
x3_subdiag_filtered, "NA")),
NA_character_)
)]
SISTRAT23_c1_2010_2022_df_prev1n_mod1[, mod_psiq_cie_10_or := {
tmp <- na.omit(c(diag1, diag2, diag3))
ifelse(length(tmp) > 0, paste(tmp, collapse = "; "), NA_character_)
}, by=.I] #to operate by row
})
invisible("Function that may work well in the future (step 4)")
# Custom function to extract components in one pass
# SISTRAT23_c1_2010_2022_df_prev1n_mod1[, c(
# "dg_cie_10_main_1", "dg_cie_10_sub_1",
# "dg_cie_10_main_2", "dg_cie_10_sub_2",
# "dg_cie_10_main_3", "dg_cie_10_sub_3"
# ) := {
# parts <- strsplit(mod_cie_10_or, "\\s*;\\s*")[[1]]
#
# # Initialize with NAs
# result <- rep(NA_character_, 6)
#
# # Parse up to 3 diagnoses
# for (i in 1:min(length(parts), 3)) {
# if (!is.na(parts[i])) {
# subparts <- strsplit(parts[i], "\\s*::\\s*")[[1]]
# result[2*i-1] <- subparts[1]
# result[2*i] <- if (length(subparts) > 1) subparts[2] else NA_character_
# }
# }
# as.list(result)
# }]
# Remove memory
gc()
# Remove temporary columns if needed
invisible({
SISTRAT23_c1_2010_2022_df_prev1n_mod1[, c("diag1", "diag2", "diag3") := NULL]
})
SISTRAT23_c1_2010_2022_df_prev1n_mod2<-
SISTRAT23_c1_2010_2022_df_prev1n_mod1|>
rowwise()|>
# Detect any in study or diagnostic of no disorder detected
mutate(dg_psiq_cie_10_instudy = any(c_across(c(dg_trs_psiq_cie_10_or, x2_diag_filtered, x3_diag_filtered)) %in% c("en estudio")))|>
mutate(dg_psiq_cie_10_no_dg = any(c_across(c(dg_trs_psiq_cie_10_or, x2_diag_filtered, x3_diag_filtered)) %in% c("sin trastorno")))|>
# Any diagnostic different than in study or non-detected
mutate(dg_psiq_cie_10_dg = any(!is.na(c_across(c(dg_trs_psiq_cie_10_or, x2_diag_filtered, x3_diag_filtered))) & !(c_across(c(dg_trs_psiq_cie_10_or, x2_diag_filtered, x3_diag_filtered)) %in% c("en estudio", "sin trastorno"))))|>
ungroup()|>
select(-contains("_pair_"), -ends_with("_filtered"))|>
# replace with NA
#select(c("hash_key", contains("cie_10")))|> slice(95:100)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Collapse and separate main (NOW FOR DSM-IV DIAGNOSES)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# First create combined diagnosis pairs for comparison
mutate(
# Create diagnosis pairs for comparison
diag_pair_1 = paste(dg_trs_psiq_dsm_iv_or,
dg_trs_psiq_sub_dsm_iv_or, sep = "::"),
diag_pair_2 = paste(x2_dg_trs_psiq_dsm_iv_or,
x2_dg_trs_psiq_sub_dsm_iv_or, sep = "::"),
diag_pair_3 = paste(x3_dg_trs_psiq_dsm_iv_or,
x3_dg_trs_psiq_sub_dsm_iv_or, sep = "::"),
# Now flag duplicates for removal
keep_pair_2 = diag_pair_2 != diag_pair_1,
keep_pair_3 = diag_pair_3 != diag_pair_1 & diag_pair_3 != diag_pair_2
)|>
# Apply the duplicate filtering
mutate(
# Set the filtered columns based on duplicate flags
# prevents the same diagnosis from being counted multiple times when we combine them into the final concatenated field
x2_diag_filtered = if_else(keep_pair_2, x2_dg_trs_psiq_dsm_iv_or, NA_character_),
x2_subdiag_filtered = if_else(keep_pair_2, x2_dg_trs_psiq_sub_dsm_iv_or, NA_character_),
x3_diag_filtered = if_else(keep_pair_3, x3_dg_trs_psiq_dsm_iv_or, NA_character_),
x3_subdiag_filtered = if_else(keep_pair_3, x3_dg_trs_psiq_sub_dsm_iv_or, NA_character_)
)
invisible({
SISTRAT23_c1_2010_2022_df_prev1n_mod2[, c("diag1_dsm", "diag2_dsm", "diag3_dsm") := .(
# First diagnosis
ifelse(!is.na(dg_trs_psiq_dsm_iv_or) &
!dg_trs_psiq_dsm_iv_or %in% c("en estudio", "sin trastorno"),
paste0(dg_trs_psiq_dsm_iv_or, "::",
ifelse(!is.na(dg_trs_psiq_sub_dsm_iv_or),
dg_trs_psiq_sub_dsm_iv_or, "NA")),
NA_character_),
# Second diagnosis
ifelse(!is.na(x2_diag_filtered) &
!x2_diag_filtered %in% c("en estudio", "sin trastorno"),
paste0(x2_diag_filtered, "::",
ifelse(!is.na(x2_subdiag_filtered),
x2_subdiag_filtered, "NA")),
NA_character_),
# Third diagnosis
ifelse(!is.na(x3_diag_filtered) &
!x3_diag_filtered %in% c("en estudio", "sin trastorno"),
paste0(x3_diag_filtered, "::",
ifelse(!is.na(x3_subdiag_filtered),
x3_subdiag_filtered, "NA")),
NA_character_)
)]
SISTRAT23_c1_2010_2022_df_prev1n_mod2[, mod_psiq_dsm_iv_or := {
tmp_dsm <- na.omit(c(diag1_dsm, diag2_dsm, diag3_dsm))
ifelse(length(tmp_dsm) > 0, paste(tmp_dsm, collapse = "; "), NA_character_)
}, by=.I] #to operate by row
})
#Remove diag columns
invisible({
SISTRAT23_c1_2010_2022_df_prev1n_mod2[, c("diag1_dsm", "diag2_dsm", "diag3_dsm") := NULL]
})
SISTRAT23_c1_2010_2022_df_prev1n_mod3<-
SISTRAT23_c1_2010_2022_df_prev1n_mod2|>
rowwise()|>
# Detect any in study or diagnostic of no disorder detected
mutate(dg_psiq_dsm_iv_instudy = any(c_across(c(dg_trs_psiq_dsm_iv_or, x2_diag_filtered, x3_diag_filtered)) %in% c("en estudio")))|>
mutate(dg_psiq_dsm_iv_no_dg = any(c_across(c(dg_trs_psiq_dsm_iv_or, x2_diag_filtered, x3_diag_filtered)) %in% c("sin trastorno")))|>
# Any diagnostic different than in study or non-detected
mutate(dg_psiq_dsm_iv_dg = any(!is.na(c_across(c(dg_trs_psiq_dsm_iv_or, x2_diag_filtered, x3_diag_filtered))) & !(c_across(c(dg_trs_psiq_dsm_iv_or, x2_diag_filtered, x3_diag_filtered)) %in% c("en estudio", "sin trastorno"))))|>
ungroup()|>
select(-contains("_pair_"), -ends_with("_filtered"))
# replace with NA
#select(c("hash_key", contains("dsm_iv")))|> slice(95:100)Cases with sub-diagnostics but without the main: DSM-IV
[1] 0
second dg.
[1] 0
third dg.
[1] 3
Cases with sub-diagnostics but without the main: ICD-10
[1] 0
second dg.
[1] 0
third dg.
[1] 0
Cases with sub-diagnostics but without the main: DSM-IV
[1] 0
second dg.
[1] 0
third dg.
[1] 3
Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno): ICD-10
[1] 3
second dg.
[1] 0
third dg.
[1] 0
Cases with sub-diagnostics but without the main, or the main in study or with explicit non-classification (sin trastorno): DSM-IV
[1] 2
second dg.
[1] 0
third dg.
[1] 4
to standardize the main category with the DSM-IV subcategory
to standardize the main category with the DSM-IV subcategory
remove redundancies and duplicates in diagnoses
merge clean diagnoses
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 4556962 243.4 7364831 393.4 7364831 393.4
Vcells 657570067 5016.9 1027311897 7837.8 823777985 6285.0
1.2. Ethnicity
To generate a more inclusive approach to ethnic identification and assuming that ethnicity is invariable per person, all ethnicity records associated with each individual were consolidated from the original dataset (SISTRAT23_c1_2010_2022_df), preserving the diversity of self-identifications through semicolon-separated values. We excluded non-reported ethnicity data (inclusive_historical_ethnicity_by_run). We also added ethnicity data from C2 to C6. This variable is called ethnicity_c1_c6_historic. For more inclusion, we added information of ethnicity of databases of 2022 to 2024.
Code
inclusive_historical_ethnicity_by_run<-
SISTRAT23_c1_2010_2022_df |>
filter(etnia!="no pertenece", !is.na(etnia))|>
group_by(hash_key) |>
summarise(etnias_distinct = paste(unique(etnia), collapse = "; ")) |>
ungroup() #|>filter(grepl(";",etnias_distinct))
inclusive_historical_ethnicity_by_run_2324<-
SISTRAT23_c1_2023_2024_df |>
filter(etnia!="no pertenece", !is.na(etnia))|>
group_by(hashkey) |>
summarise(etnias_distinct = paste(unique(etnia), collapse = "; ")) |>
ungroup() #|>filter(grepl(";",etnias_distinct))
c2_inclusive_historical_ethnicity_by_run<-
CONS_C2 |>
filter(etnia!="no pertenece", !is.na(etnia)) |>
group_by(HASH_KEY) |>
summarise(etnias_distinct = paste(unique(etnia), collapse = "; ")) |>
ungroup() #|> filter(grepl(";",etnias_distinct))
c2_inclusive_historical_ethnicity_by_run_2324<-
c2_2324 |>
filter(etnia!="No pertenece", !is.na(etnia)) |>
group_by(hashkey) |>
summarise(etnias_distinct = paste(unique(etnia), collapse = "; ")) |>
ungroup() #|> filter(grepl(";",etnias_distinct))
c2_inclusive_historical_ethnicity_by_run_2324$etnias_distinct <- tolower(c2_inclusive_historical_ethnicity_by_run_2324$etnias_distinct)
c3_inclusive_historical_ethnicity_by_run<-
CONS_C3 |>
filter(etnia!="no pertenece", !is.na(etnia)) |>
group_by(HASH_KEY) |>
summarise(etnias_distinct = paste(unique(etnia), collapse = "; ")) |>
ungroup() #|> filter(grepl(";",etnias_distinct))
c4_inclusive_historical_ethnicity_by_run<-
CONS_C4 |>
filter(etnia!="no pertenece", !is.na(etnia)) |>
group_by(HASH_KEY) |>
summarise(etnias_distinct = paste(unique(etnia), collapse = "; ")) |>
ungroup() #|> filter(grepl(";",etnias_distinct))
c5_inclusive_historical_ethnicity_by_run<-
CONS_C5 |>
filter(etnia!="no pertenece", !is.na(etnia)) |>
group_by(HASH_KEY) |>
summarise(etnias_distinct = paste(unique(etnia), collapse = "; ")) |>
ungroup() #|> filter(grepl(";",etnias_distinct))
c6_inclusive_historical_ethnicity_by_run<-
if(filter(CONS_C6, paisnacimiento=="no pertenece") |> nrow()>0){
CONS_C6 |>
filter(paisnacimiento!="no pertenece", !is.na(paisnacimiento), paisnacimiento!="chile") |>
group_by(HASH_KEY) |>
summarise(etnias_distinct = paste(unique(paisnacimiento), collapse = "; ")) |>
ungroup() #|> filter(grepl(";",etnias_distinct))
} else {
CONS_C6 |>
filter(etnia!="no pertenece", !is.na(etnia), etnia!="chile") |>
group_by(HASH_KEY) |>
summarise(etnias_distinct = paste(unique(etnia), collapse = "; ")) |>
ungroup() #|> filter(grepl(";",etnias_distinct))
}
SISTRAT23_c1_2010_2022_df_prev1o<-
SISTRAT23_c1_2010_2022_df_prev1n_mod3|>
(\(df) {
cat(paste0("5.Number of cases after normalization of data editing: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("5.Number of patients after normalization of data editing: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})()|>
left_join(inclusive_historical_ethnicity_by_run_2324, by=c("hash_key"="hashkey"), multiple="first")|>
rename("ethnicity_inclusive2324"="etnias_distinct")|>
left_join(inclusive_historical_ethnicity_by_run, by="hash_key", multiple="first")|>
rename("ethnicity_inclusive"="etnias_distinct")|>
left_join(c2_inclusive_historical_ethnicity_by_run, by=c("hash_key"="HASH_KEY"), multiple="first")|>
rename("ethnicity_inclusive_c2"="etnias_distinct")|>
left_join(c2_inclusive_historical_ethnicity_by_run_2324, by=c("hash_key"="hashkey"), multiple="first")|>
rename("ethnicity_inclusive_c2_2224"="etnias_distinct")|>
left_join(c3_inclusive_historical_ethnicity_by_run, by=c("hash_key"="HASH_KEY"), multiple="first")|>
rename("ethnicity_inclusive_c3"="etnias_distinct")|>
left_join(c4_inclusive_historical_ethnicity_by_run, by=c("hash_key"="HASH_KEY"), multiple="first")|>
rename("ethnicity_inclusive_c4"="etnias_distinct")|>
left_join(c5_inclusive_historical_ethnicity_by_run, by=c("hash_key"="HASH_KEY"), multiple="first")|>
rename("ethnicity_inclusive_c5"="etnias_distinct")|>
left_join(c6_inclusive_historical_ethnicity_by_run, by=c("hash_key"="HASH_KEY"), multiple="first")|>
rename("ethnicity_inclusive_c6"="etnias_distinct")|>
(\(df) {
cat(paste0("5. After normalization and data editing, obs.: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("5. After normalization and data editing, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1n))stop("Error: Added treatment episodes in the process")
df
})()
SISTRAT23_c1_2010_2022_df_prev1o<-
# First, split each ethnicity column by "; ", extract unique values, and combine into a new column
SISTRAT23_c1_2010_2022_df_prev1o|>
rowwise()|>
mutate(
ethnicity_c1_c6_historic = {
# Get all columns starting with "ethnicity_inclusive_"
# Use the names of the dataframe directly
all_cols <- names(SISTRAT23_c1_2010_2022_df_prev1o)
eth_cols <- c("ethnicity_inclusive",
grep("^ethnicity_inclusive_", all_cols, value = TRUE))
# Extract values from these columns that exist
eth_values <- c()
for (col in eth_cols) {
if (col %in% all_cols) {
val <- get(col)
if (!is.na(val)) eth_values <- c(eth_values, val)
}
}
# Split each value by semicolon and flatten
if (length(eth_values) > 0) {
all_eth <- unlist(strsplit(eth_values, "\\s*;\\s*"))
paste(unique(all_eth), collapse = "; ")
} else {
NA_character_
}
}
)|>
ungroup()|>
select(-any_of((starts_with("ethnicity_inclusive"))))
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#table(SISTRAT23_c1_2010_2022_df_prev1o$tipo_centro_derivacion, SISTRAT23_c1_2010_2022_df_prev1o$motivo_de_egreso)
#table(SISTRAT23_c1_2010_2022_df_prev1o$motivo_de_egreso_alta_administrativa, SISTRAT23_c1_2010_2022_df_prev1o$motivo_de_egreso)5.Number of cases after normalization of data editing: 150,046
5.Number of patients after normalization of data editing: 106,283
5. After normalization and data editing, obs.: 150,046
5. After normalization and data editing, RUNs: 106,283
2. More than One Value within User, Concerning User-Invariant Variables
We need to obtain sociodemographic categories that are usually invariant for a given individual. Although this assumption is highly debatable, it allows us to detect inequalities stemming from these distinctions and their associations with social roles and stigmatization. For this purpose, we used external databases linked to SENDA agreements 2 through 6, together with hospitalization and Prosecutor’s Office databases.
Code
knitr::include_graphics(paste0(wdpath,"cons/_figs/same_hash_distinct_values_user_invariant.svg"))2.1. Sex
- Sex (sexo_2) (patients= 500). If there were only two observations with distinct sexes but in the same yearly dataset, we replaced the sex to “women” if the user has ever been in a type of program specifically for women, or if the user has ever been pregnant.
The primary approach differs depending on the availability of external data (count_not_na). The system first checks for clear agreement between internal (c1_perc_mujer) and external (perc_fem_ext) data. If both sources indicate a strong majority (>50%) for the same sex, that sex is assigned (Cases 6.a.a.1, 6.a.a.2). In cases of disagreement or ambiguity (e.g., one source shows a tie at 50%, or sources point to different sexes), the decision relies on comparing the quantity of records in each source (total vs. count_not_na). Generally, the source with more records is given higher weight.
When Only Internal Data is Available (count_not_na == 0), the decision relies solely on the internal data proportion (c1_perc_mujer) and the total number of internal records (total), and consider ties.
Code
decision_sex_inconsistencies<-
DiagrammeR::grViz("
digraph decision_tree {
graph [rankdir = TB, nodesep = 0.5]
node [fontname = Helvetica, shape = diamond, width = 3.5, height = 1.2]
edge [fontname = Helvetica]
# Start node
start [label = 'Start', shape = oval]
# Main branches
node0 [label = 'External data exists?\n(count_not_na > 0)']
# ========== EXTERNAL DATA AVAILABLE (6.a) ==========
subgraph cluster_external {
label = 'External Data Available (count_not_na > 0)'
color = blue
node1 [label = 'Both internal & external >50% female?\n(c1_perc_mujer > 0.5 & perc_fem_ext > 0.5)']
node2 [label = 'Both internal & external <50% female?\n(c1_perc_mujer < 0.5 & perc_fem_ext < 0.5)']
node3 [label = 'Internal tie & external female majority &\nExternal data > internal?']
node4 [label = 'Internal tie & external male majority &\nExternal data > internal?']
node5 [label = 'Internal female majority & external tie &\nInternal data > external?']
node6 [label = 'Internal male majority & external tie &\nInternal data > external?']
node7 [label = 'Internal tie & external female majority &\nInternal data > external?']
node8 [label = 'Internal tie & external male majority &\nInternal data > external?']
node9 [label = 'External majority female & internal <50% &\nExternal data > internal?']
node10 [label = 'External majority male & internal >=50% &\nExternal data > internal?']
node11 [label = 'External tie & internal female majority &\nExternal data > internal?']
node12 [label = 'External tie & internal male majority &\nExternal data > internal?']
node13 [label = 'Conflicting majorities &\nEqual data quantity']
node14 [label = 'Tie in both internal &\nexternal data']
# Outcomes
outcome1 [label = '6.a.a.1.female', shape = box]
outcome2 [label = '6.a.a.2.male', shape = box]
outcome3 [label = '6.a.b.1.female', shape = box]
outcome4 [label = '6.a.b.2.male', shape = box]
outcome5 [label = '6.a.b.1.female', shape = box]
outcome6 [label = '6.a.b.2.male', shape = box]
outcome7 [label = '6.a.b.1.female, but ask aux data', shape = box]
outcome8 [label = '6.a.b.2.male, but ask aux data', shape = box]
outcome9 [label = '6.a.b.1.female', shape = box]
outcome10 [label = '6.a.b.2.male', shape = box]
outcome11 [label = '6.a.b.1.female, but ask aux data', shape = box]
outcome12 [label = '6.a.b.2.male, but ask aux data', shape = box]
outcome13 [label = '6.a.b.3.nondet', shape = box]
outcome14 [label = '6.a.c.nondet', shape = box]
# Connections
node0 -> node1 [label = 'Yes']
node1 -> outcome1 [label = 'Yes']
node1 -> node2 [label = 'No']
node2 -> outcome2 [label = 'Yes']
node2 -> node3 [label = 'No']
node3 -> outcome3 [label = 'Yes']
node3 -> node4 [label = 'No']
node4 -> outcome4 [label = 'Yes']
node4 -> node5 [label = 'No']
node5 -> outcome5 [label = 'Yes']
node5 -> node6 [label = 'No']
node6 -> outcome6 [label = 'Yes']
node6 -> node7 [label = 'No']
node7 -> outcome7 [label = 'Yes']
node7 -> node8 [label = 'No']
node8 -> outcome8 [label = 'Yes']
node8 -> node9 [label = 'No']
node9 -> outcome9 [label = 'Yes']
node9 -> node10 [label = 'No']
node10 -> outcome10 [label = 'Yes']
node10 -> node11 [label = 'No']
node11 -> outcome11 [label = 'Yes']
node11 -> node12 [label = 'No']
node12 -> outcome12 [label = 'Yes']
node12 -> node13 [label = 'No']
node13 -> outcome13 [label = 'Yes']
node13 -> node14 [label = 'No']
node14 -> outcome14 [label = 'Yes']
}
# ========== NO EXTERNAL DATA (6.b) ==========
subgraph cluster_internal {
label = 'No External Data (count_not_na == 0)'
color = green
node15 [label = 'Total even?\n(total in 1-40 records\neven numbers)']
node16 [label = 'Internal majority female?\n(c1_perc_mujer > 0.5)']
node17 [label = 'Internal majority male?\n(c1_perc_mujer < 0.5)']
node18 [label = 'Total odd?']
# Outcomes
outcome15 [label = '6.b.a.1.female', shape = box]
outcome16 [label = '6.b.a.2.male', shape = box]
outcome17 [label = '6.b.a.3.nondet', shape = box]
outcome18 [label = '6.b.b.1.female', shape = box]
outcome19 [label = '6.b.b.1.male', shape = box]
# Connections
node0 -> node15 [label = 'No']
node15 -> node16 [label = 'Yes']
node15 -> node18 [label = 'No']
node16 -> outcome15 [label = 'Yes']
node16 -> node17 [label = 'No']
node17 -> outcome16 [label = 'Yes']
node17 -> outcome17 [label = 'No']
node18 -> outcome18 [label = 'c1_perc_mujer >= 0.5']
node18 -> outcome19 [label = 'c1_perc_mujer < 0.5']
}
start -> node0
}
",
width = 1200,
height = 900
)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:
invisible("Export database to explore it")
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
# WidthCM<-8
# HeightCM<-6
# DPI<-600
unlink(paste0(wdpath,"cons/_figs/decision_sex_inconsistencies_files"), recursive = TRUE)
htmlwidgets::saveWidget(decision_sex_inconsistencies, paste0(wdpath,"cons/_figs/decision_sex_inconsistencies.html"))
webshot::webshot(paste0(wdpath,"cons/_figs/decision_sex_inconsistencies.html"),paste0(wdpath,"cons/_figs/decision_sex_inconsistencies.png"), vwidth = 300*1.2, vheight = 300, zoom=10, expand=100) # Prueba con diferentes coordenadas top, left, width, and height.[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
Code
invalid_sex_by_patient<-
SISTRAT23_c1_2010_2022_df_prev1o|> group_by(hash_key)|> summarise(sexo_por_hash = n_distinct(sexo), miss_sexo = sum(is.na(sexo), na.rm=T), tot_obs = n())|> ungroup()|> mutate(perc_miss_sexo = miss_sexo/tot_obs)|> filter(sexo_por_hash>1|perc_miss_sexo==1)|> pull(hash_key)
invisible("======================================================")
invalid_sex_hashs_hosp<-
HOSP_filter_df|>
filter(run %in% invalid_sex_by_patient)|>
(\(df) {
print(message(paste0("Hospital, Entries: ", nrow(df))))
print(message(paste0("Hospital, RUNs: ", distinct(df, run) |> nrow())))
df
})()|>
distinct(run, sexo)|>
group_by(run)|>
mutate(id = as.character(dplyr::row_number()))|>
pivot_wider(names_from = id, values_from = sexo,
names_prefix = "h_sex_")Code
# Hospital, Entries: 1656
# NULL
# Hospital, RUNs: 371
# NULL
invisible("======================================================")
invalid_sex_top<-
SISTRAT23_top_2015_2022_df|>
filter(HASH_KEY %in% invalid_sex_by_patient)|>
(\(df) {
print(message(paste0("TOP, Entries: ", nrow(df))))
print(message(paste0("TOP, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
df
})()|>
distinct(HASH_KEY, sexo)|>
#select(HASH_KEY, sexo)|>
ungroup()|>
group_by(HASH_KEY)|>
mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
pivot_wider(names_from = id, values_from = sexo,
names_prefix = "t_sex_")Code
# TOP, Entries: 1518
# NULL
# TOP, RUNs: 310
# NULL
invisible("======================================================")
invalid_sex_c2<-
CONS_C2 |>
filter(HASH_KEY %in% invalid_sex_by_patient)|>
(\(df) {
print(message(paste0("C2, Entries: ", nrow(df))))
print(message(paste0("C2, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
df
})()|>
distinct(HASH_KEY, sexo)|>
ungroup()|>
group_by(HASH_KEY)|>
mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
pivot_wider(names_from = id, values_from = sexo,
names_prefix = "c2_sex_")Code
# C2, Entries: 0
# NULL
# C2, RUNs: 0
# NULL
invisible("======================================================")
invalid_sex_c3<-
CONS_C3|>
filter(HASH_KEY %in% invalid_sex_by_patient)|>
(\(df) {
print(message(paste0("C3, Entries: ", nrow(df))))
print(message(paste0("C3, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
df
})()|>
distinct(HASH_KEY, sexo)|>
ungroup()|>
group_by(HASH_KEY)|>
mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
pivot_wider(names_from = id, values_from = sexo,
names_prefix = "c3_sex_")Code
# C3, Entries: 4
# NULL
# C3, RUNs: 4
# NULL
invisible("======================================================")
invalid_sex_c4<-
CONS_C4 |>
filter(HASH_KEY %in% invalid_sex_by_patient)|>
(\(df) {
print(message(paste0("C4, Entries: ", nrow(df))))
print(message(paste0("C4, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
df
})()|>
distinct(HASH_KEY, sexo) |>
ungroup()|>
group_by(HASH_KEY)|>
mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
pivot_wider(names_from = id, values_from = sexo,
names_prefix = "c4_sex_")Code
# C4, Entries: 2
# NULL
# C4, RUNs: 1
# NULL
invisible("======================================================")
invalid_sex_c5<-
CONS_C5 |>
filter(HASH_KEY %in% invalid_sex_by_patient)|>
(\(df) {
print(message(paste0("C5, Entries: ", nrow(df))))
print(message(paste0("C5, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
df
})() |>
distinct(HASH_KEY, sexo)|>
ungroup()|>
group_by(HASH_KEY)|>
mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
pivot_wider(names_from = id, values_from = sexo,
names_prefix = "c5_sex_")Code
# C5, Entries: 1
# NULL
# C5, RUNs: 1
# NULL
invisible("======================================================")
invalid_sex_c6<-
CONS_C6 |>
filter(HASH_KEY %in% invalid_sex_by_patient)|>
(\(df) {
print(message(paste0("C6, Entries: ", nrow(df))))
print(message(paste0("C6, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
df
})()|>
distinct(HASH_KEY, sexo)|>
ungroup()|>
group_by(HASH_KEY)|>
mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
pivot_wider(names_from = id, values_from = sexo,
names_prefix = "c6_sex_")Code
# C6, Entries: 1
# NULL
# C6, RUNs: 1
# NULL
invisible("======================================================")
invalid_sex_mortality<-
mortality |>
filter(hashkey %in% invalid_sex_by_patient)|>
(\(df) {
print(message(paste0("Mortality, Entries: ", nrow(df))))
print(message(paste0("Mortality, RUNs: ", distinct(df, hashkey)|> nrow())))
df
})()|>
distinct(hashkey, sexo)|>
ungroup()|>
rename("m_sexo"="sexo")Code
# Mortality, Entries: 15
# NULL
# Mortality, RUNs: 15
# NULL
invisible("======================================================")
invalid_sex_may23_PO_office<-
OLD_NEW_SISTRAT23_c1_2010_2022_df2|>
tidylog::right_join(Base_fiscalia_v2, by=c("HASH_KEY.y"="rut_enc_saf"))|>
select("HASH_KEY.x","HASH_KEY.y", "sexo.y")|>
filter(HASH_KEY.x %in% invalid_sex_by_patient)|>
(\(df) {
print(message(paste0("PO Office, Entries: ", nrow(df))))
print(message(paste0("PO Office, RUNs: ", distinct(df, HASH_KEY.x) |> nrow())))
df
})()|>
group_by(HASH_KEY.x)|>
summarise(femenino = sum(grepl("FEM", sexo.y)),masculino = sum(grepl("MASC", sexo.y)), total=n())|>
ungroup()|>
mutate(po_perc_fem = femenino / total, po_perc_masc = masculino / total)|>
filter(po_perc_masc<.5|po_perc_fem>.5)|>
(\(df) {
print(message(paste0("PO Office, only clear sexes, RUNs: ", distinct(df, HASH_KEY.x) |> nrow())))
df
})()Code
# PO Office, Entries: 18205
# NULL
# PO Office, RUNs: 452
# NULL
# PO Office, only clear sexes, RUNs: 282
# NULL
invisible("======================================================")
invalid_sex_c1_2324<-
SISTRAT23_c1_2023_2024_df2|>
filter(hash_key %in% invalid_sex_by_patient)|>
(\(df) {
print(message(paste0("C1, 23-24, Entries: ", nrow(df))))
print(message(paste0("C1, 23-24, RUNs: ", distinct(df, hash_key)|> nrow())))
df
})()|>
distinct(hash_key, sexo)|>
ungroup()|>
group_by(hash_key)|>
mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
pivot_wider(names_from = id, values_from = sexo,
names_prefix = "c1_2324_sex_")Code
invisible("======================================================")
invalid_sex_c1_2324_idgen<-
SISTRAT23_c1_2023_2024_df2|>
filter(hash_key %in% invalid_sex_by_patient)|>
(\(df) {
print(message(paste0("C1, 23-24, Gender identity, Entries: ", nrow(df))))
print(message(paste0("C1, 23-24, Gender identity, RUNs: ", distinct(df, hash_key)|> nrow())))
df
})()|>
distinct(hash_key, identidad_de_genero)|>
ungroup()|>
group_by(hash_key)|>
mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
pivot_wider(names_from = id, values_from = identidad_de_genero,
names_prefix = "c1_2324_genid_")Code
invisible("======================================================")
invalid_sex_top_2224<-
top_2224|>
filter(hashkey %in% invalid_sex_by_patient)|>
(\(df) {
print(message(paste0("TOP, 23-24, Entries: ", nrow(df))))
print(message(paste0("TOP, 23-24, RUNs: ", distinct(df, hashkey)|> nrow())))
df
})()|>
distinct(hashkey, sexo)|>
ungroup()|>
group_by(hashkey)|>
mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
pivot_wider(names_from = id, values_from = sexo,
names_prefix = "top_2224_sex_")Code
invisible("======================================================")
invalid_sex_c2_2224<-
c2_2324 |>
filter(hashkey %in% invalid_sex_by_patient)|>
(\(df) {
print(message(paste0("C2, 22-24, Entries: ", nrow(df))))
print(message(paste0("C2, 22-24, RUNs: ", distinct(df, hashkey)|> nrow())))
df
})()|>
distinct(hashkey, sexo)|>
ungroup()|>
group_by(hashkey)|>
mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
pivot_wider(names_from = id, values_from = sexo,
names_prefix = "c2_2224_sex_")Code
# C2, 22-24, Entries: 0
# NULL
# C2, 22-24, RUNs: 0
# NULL
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invalid_sex_ext_info<-
SISTRAT23_c1_2010_2022_df_prev1o|>
tidytable::filter(hash_key %in% invalid_sex_by_patient)|>
select(hash_key, sexo)|>
group_by(hash_key)|>
summarise(hombre = sum(grepl("hom", sexo)),mujer = sum(grepl("muj", sexo)), total=n())|>
ungroup()|>
mutate(c1_perc_hombre = hombre / total, c1_perc_mujer = mujer / total)|>
select(hash_key, c1_perc_hombre, c1_perc_mujer, total)|>
tidylog::left_join(invalid_sex_hashs_hosp, by=c("hash_key"="run"), multiple="first")|>
tidytable::select(hash_key, c1_perc_hombre, c1_perc_mujer, total, h_sex_1, h_sex_2)|>
tidylog::left_join(invalid_sex_top, by=c("hash_key"="HASH_KEY"), multiple="first")|>
tidylog::left_join(invalid_sex_c2, by=c("hash_key"="HASH_KEY"), multiple="first")|>
tidylog::left_join(invalid_sex_c3, by=c("hash_key"="HASH_KEY"), multiple="first")|>
tidylog::left_join(invalid_sex_c4, by=c("hash_key"="HASH_KEY"), multiple="first")|>
tidylog::left_join(invalid_sex_c5, by=c("hash_key"="HASH_KEY"), multiple="first")|>
tidylog::left_join(invalid_sex_c6, by=c("hash_key"="HASH_KEY"), multiple="first")|>
tidylog::left_join(invalid_sex_mortality, by=c("hash_key"="hashkey"), multiple="first")|>
tidylog::left_join(invalid_sex_may23_PO_office[,c("HASH_KEY.x","po_perc_fem", "po_perc_masc")], by=c("hash_key"="HASH_KEY.x"), multiple="first")|>
tidylog::left_join(invalid_sex_c1_2324, by=c("hash_key"="hash_key"), multiple="first")|>
tidylog::left_join(invalid_sex_top_2224, by=c("hash_key"="hashkey"), multiple="first")|>
tidylog::left_join(invalid_sex_c2_2224, by=c("hash_key"="hashkey"), multiple="first")|>
(\(df) {
print(message(paste0("Invalid sex that have at least one external sex, Entries: ", nrow(df))))
print(message(paste0("Invalid sex that have at least one external sex, RUNs: ", tidytable::distinct(df, hash_key)|> nrow())))
df
})()|>
(\(df) {
columns <- c("h_sex_1", "h_sex_2", "t_sex_1", "t_sex_2", "c3_sex_1",
"c4_sex_1", "c5_sex_1", "c6_sex_1", "m_sexo", "po_perc_fem", "po_perc_masc", "c1_2324_sex_1", "c1_2324_sex_2", "top_2224_sex_1")
mutate(df, count_not_na = rowSums(!is.na(select(df, all_of(columns)))))
})()|>
rowwise()|>
mutate(count_fem = sum(h_sex_1==2,
h_sex_2==2, #amaru= mutate(SEX= factor(SEXO, levels= c(1, 2), labels= c("male", "female")))
t_sex_1=="mujer",
t_sex_2=="mujer",
c3_sex_1=="mujer",
c4_sex_1=="mujer",
c5_sex_1=="femenino",
c6_sex_1=="mujer",
m_sexo==2,
po_perc_fem>.5,
c1_2324_sex_1=="mujer",
c1_2324_sex_2=="mujer",
top_2224_sex_1=="Mujer", na.rm=T))|>
ungroup()|>
mutate(perc_fem_ext= count_fem/count_not_na)Code
invisible("No tiene info ext")
#filter(invalid_sex_ext_info, count_not_na==0) |> nrow() #22 # june 2025 22
invisible("No tiene info ext, sólo 2 obs")
#filter(invalid_sex_ext_info, count_not_na==0, total==2) |> nrow() #11 #11
invisible("No tiene info ext, empate")
#filter(invalid_sex_ext_info, count_not_na==0, c1_perc_hombre==c1_perc_mujer) |> nrow() #11, #11 lo mismo
invalid_sex_ext_info_post<-
invalid_sex_ext_info |> #perfect and accords
mutate(decision = case_when(count_not_na>0 & perc_fem_ext>.5 & c1_perc_mujer>.5 ~ "6.a.a.1.female",
count_not_na>0 & c1_perc_mujer<.5 & perc_fem_ext<.5~ "6.a.a.2.male",
#less straight evidence, but more ext info to think is a woman
count_not_na>0 & c1_perc_mujer==.5 & perc_fem_ext>.5 & total< count_not_na~ "6.a.b.1.female",
#less straight evidence, but more ext info to think is a man
count_not_na>0 & c1_perc_mujer==.5 & perc_fem_ext<.5 & total< count_not_na~ "6.a.b.2.male",
#more int info to think is a woman, external is inconsistent
count_not_na>0 & c1_perc_mujer>.5 & perc_fem_ext==.5 & total> count_not_na~ "6.a.b.1.female",
#more int info to think is a man, external is inconsistent
count_not_na>0 & c1_perc_mujer<.5 & perc_fem_ext==.5 & total> count_not_na~ "6.a.b.2.male",
#less straight evidence, but less ext info to think is a woman
count_not_na>0 & c1_perc_mujer==.5 & perc_fem_ext>.5 & total> count_not_na~ "6.a.b.1.female, but ask aux data",
#less straight evidence, but less ext info to think is a man
count_not_na>0 & c1_perc_mujer==.5 & perc_fem_ext<.5 & total> count_not_na~ "6.a.b.2.male, but ask aux data",
#external data that says that is a female is more frequent
count_not_na>0 & c1_perc_mujer<.5 & perc_fem_ext>=.5 & total< count_not_na~ "6.a.b.1.female",
#external data that says that is a male is more frequent
count_not_na>0 & c1_perc_mujer>=.5 & perc_fem_ext<.5 & total< count_not_na~ "6.a.b.2.male",
#external data is inconsistent, but C1 contains more data and points out to woman
count_not_na>0 & c1_perc_mujer>.5 & perc_fem_ext==.5 & total< count_not_na~ "6.a.b.1.female, but ask aux data",
#external data is inconsistent, but C1 contains more data and points out to man
count_not_na>0 & c1_perc_mujer<.5 & perc_fem_ext==.5 & total< count_not_na~ "6.a.b.2.male, but ask aux data",
#external data, same frequency of C1 data; inconsistent probabilities
count_not_na>0 & c1_perc_mujer>=.5 & perc_fem_ext<.5 & total== count_not_na~ "6.a.b.3.nondet",
#external data, same frequency of C1 data; C1 is inconsistent, but external points out to woman
count_not_na>0 & c1_perc_mujer==.5 & perc_fem_ext>.5 & total ==count_not_na~ "6.a.b.1.female, but ask aux data",
#external data, same frequency of C1 data; C1 is inconsistent, but external points out to woman
count_not_na>0 & c1_perc_mujer==.5 & perc_fem_ext<.5 & total ==count_not_na~ "6.a.b.2.male, but ask aux data",
#external data, same frequency of C1 data; C1 is points out to woman, but external is inconsistent
count_not_na>0 & c1_perc_mujer>.5 & perc_fem_ext==.5 & total ==count_not_na~ "6.a.b.1.female, but ask aux data",
#external data, same frequency of C1 data; C1 is points out to man, but external is inconsistent
count_not_na>0 & c1_perc_mujer<.5 & perc_fem_ext==.5 & total ==count_not_na~ "6.a.b.2.male, but ask aux data",
#C1 data that says that is a female is more frequent
count_not_na>0 & c1_perc_mujer>.5 & perc_fem_ext<.5 & total> count_not_na~ "6.a.b.4.female, but ask aux data",
#C1 data that says that is a male is more frequent
count_not_na>0 & c1_perc_mujer<.5 & perc_fem_ext>.5 & total> count_not_na~ "6.a.b.5.male, but ask aux data",
#C1 data which was more frequent had a tie
count_not_na>0 & perc_fem_ext>.5 & c1_perc_mujer==.5 & total> count_not_na~ "6.a.b.6.nondet",
#external data, same percentage, remaining data
count_not_na>0 & c1_perc_mujer==.5 & perc_fem_ext==.5~ "6.a.c.nondet",
#more female records
count_not_na==0 & total%% 2 == 0 & c1_perc_mujer>.5~ "6.b.a.1.female",
#more male records
count_not_na==0 & total%% 2 == 0 & c1_perc_mujer<.5~ "6.b.a.2.male",
#ties in female values
count_not_na==0 & total%% 2 == 0 & c1_perc_mujer==.5~ "6.b.a.3.nondet",
#
count_not_na==0 & total%% 2 != 0 & c1_perc_mujer>=.5~ "6.b.b.1.female",
count_not_na==0 & total%% 2 != 0 & c1_perc_mujer<.5~ "6.b.b.1.male",
T~"no sé"))
table(invalid_sex_ext_info_post$decision) |> data.frame() |> arrange(desc(Freq)) |>
knitr::kable("markdown", caption= "Preliminary solve inconsistencies in sex")NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
| Var1 | Freq |
|---|---|
| 6.a.a.1.female | 139 |
| 6.a.b.1.female | 125 |
| 6.a.c.nondet | 45 |
| 6.a.a.2.male | 43 |
| 6.a.b.2.male, but ask aux data | 38 |
| 6.a.b.1.female, but ask aux data | 35 |
| 6.a.b.2.male | 27 |
| 6.a.b.3.nondet | 17 |
| 6.b.a.3.nondet | 11 |
| 6.b.b.1.male | 10 |
| 6.a.b.5.male, but ask aux data | 5 |
| 6.a.b.4.female, but ask aux data | 3 |
| 6.b.a.2.male | 1 |
| no sé | 1 |
For patients whose sex remained undetermined (n= 73) or ambiguous (n= 81) after the initial classification, we used pregnancy status and program type information to aid in the final determination. We also checked information of this kind in C1 to C6 databases (about pregnancy status). At last, we also used gender identity as a proxy.
To help determine a patient’s sex when it wasn’t clear after using pregnancy status, we created a tool that examines their primary diagnosis code (ICD-10) of patients with hospitalization records. This tool (infer_sex_icd10 function) checks the code against two specific lists: one containing codes strongly linked to female conditions (like pregnancy or female-specific cancers) and another with codes strongly linked to male conditions (like prostate issues or male-specific cancers).
If a patient’s code clearly matches a pattern on only one list, the tool suggests that sex (Female or Male). If the code doesn’t match any specific pattern on either list, or if it somehow matches patterns on both lists (which indicates a potential issue with the code or patterns), the tool flags the case as undetermined based on the diagnosis alone. This flagging signals that we need to rely on other information, such as external data sources or details like pregnancy status, to make the final sex determination.
Code
invalid_sex_ext_info_post_nondet<-invalid_sex_ext_info_post |> filter(grepl("nondet", decision))
invalid_sex_ext_info_post_ask<-invalid_sex_ext_info_post |> filter(grepl("ask", decision))
c1_6_sex_ext_data<-
group_by(subset(SISTRAT23_c1_2010_2022_df_prev1o, hash_key %in% c(invalid_sex_ext_info_post_nondet$hash_key, invalid_sex_ext_info_post_ask$hash_key)), hash_key)|> summarise(n_embarazada= sum(se_trata_de_una_mujer_embarazada=="si", na.rm=T), n_emb_egr= sum(ha_estado_embarazada_egreso=="si", na.rm=T), n_prog_mujeres= sum(grepl("mujeres",tipo_de_programa),na.rm=T),.groups="drop_last") |>
mutate(pregnancy_c2= ifelse(hash_key %in% c(subset(CONS_C2, a_setratadeunamujerembaraza=="si", "HASH_KEY"),subset(CONS_C2, haestadoembarazadaegreso=="si", "HASH_KEY")),1,0))|> mutate(pregnancy_c3= ifelse(hash_key %in% c(subset(CONS_C3, haestadoembarazadaegreso=="si", "HASH_KEY"),subset(CONS_C3, setratadeunamujerembarazad=="si", "HASH_KEY")),1,0))|>
mutate(pregnancy_c4= ifelse(hash_key %in% c(subset(CONS_C4, haestadoembarazadaegreso=="si", "HASH_KEY"),subset(CONS_C4, setratadeunamujerembarazada=="si", "HASH_KEY")),1,0))|>
mutate(pregnancy_c5= ifelse(hash_key %in% c(subset(CONS_C5, embarazo=="si", "HASH_KEY"),subset(CONS_C5, haestadoembarazadaegreso=="si", "HASH_KEY")),1,0))|>
mutate(pregnancy_c6= ifelse(hash_key %in% c(subset(CONS_C6, haestadoembarazadaegreso=="si", "HASH_KEY"),subset(CONS_C6, setratadeunamujerembarazada=="si", "HASH_KEY")),1,0))
#c2 to c6 didnt add info
invalid_sex_ext_info_post_nondet|>
left_join(c1_6_sex_ext_data, by="hash_key")|>
mutate(ext_data_woman= ifelse(n_embarazada>0|n_emb_egr>0|n_prog_mujeres>0,1,0))|>
mutate(ext_data_woman2= ifelse(pregnancy_c2>0|pregnancy_c3>0|pregnancy_c4>0|pregnancy_c5>0|pregnancy_c6>0,1,0))|>
(\(df) {
cat(paste0("Non-determined sex with pregnancy status: ", filter(df, ext_data_woman==1) |> nrow()))
filter(df, ext_data_woman==1) |> pull(hash_key) ->> hashs_invalid_sex_nondet_pregnant
})()Non-determined sex with pregnancy status: 16
Code
invalid_sex_ext_info_post_ask|>
left_join(c1_6_sex_ext_data, by="hash_key")|>
mutate(decision_woman= grepl("female",decision), ext_data_woman= ifelse(n_embarazada>0|n_emb_egr>0|n_prog_mujeres>0,1,0))|>
mutate(ext_data_woman2= ifelse(pregnancy_c2>0|pregnancy_c3>0|pregnancy_c4>0|pregnancy_c5>0|pregnancy_c6>0,1,0))|>
#janitor::tabyl(decision_woman, ext_data_woman)|>
(\(df) {
cat(paste0("Suggested as being female and with pregnancy status: ", filter(df, decision_woman==1 & ext_data_woman==1) |> nrow()))
filter(df, decision_woman==1 & ext_data_woman==1) |> pull(hash_key) ->> hashs_invalid_sex_woman_ask_pregnant
filter(df, decision_woman==0 & ext_data_woman==1) |> pull(hash_key) ->> hashs_invalid_sex_man_ask_pregnant
})()Suggested as being female and with pregnancy status: 20
Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
infer_sex_icd10 <- function(icd_codes) {
# Enhanced female-specific ICD-10 patterns
female_patterns <- c(
"^O", # Pregnancy/childbirth (O00-O99)
"^C5[1-8]", # Female genital cancers (C51-C58)
"^D0[6-7]", # CIS female genital (D06-D07)
"^D2[4-8]", # Benign female neoplasms (D24-D28)
"^N7[0-7]", # PID (N70-N77)
"^N8[0-9]|^N9[0-8]", # Non-inflammatory disorders (N80-N98)
"^Q5[0-2]", # Congenital female anomalies (Q50-Q52)
"^Z12\\.4", # Female cancer screening
"^Z3[0-9]" # Reproductive health encounters (Z30-Z39)
)
# Enhanced male-specific ICD-10 patterns
male_patterns <- c(
"^N[4-5][0-9]", # Male genital disorders (N40-N51)
"^C6[0-3]", # Male genital cancers (C60-C63)
"^D29", # Benign male neoplasms
"^Q5[3-5]", # Congenital male anomalies (Q53-Q55)
"^Z12\\.5", # Prostate screening
"^Z41\\.2", # Vasectomy
"^Z90\\.7" # Acquired absence of male genital
)
# Check matches
is_female <- map_lgl(icd_codes, ~ any(stringr::str_detect(.x, female_patterns)))
is_male <- map_lgl(icd_codes, ~ any(stringr::str_detect(.x, male_patterns)))
case_when(
is_female & !is_male ~ "Female",
is_male & !is_female ~ "Male",
is_female & is_male ~ "Conflict",
TRUE ~ "nondet"
)
}
cat("Classification based of ICD-10 diagnoses in hospitalizations")Classification based of ICD-10 diagnoses in hospitalizations
Code
HOSP_filter_df|>
mutate(sex= infer_sex_icd10(diag1))|>
janitor::tabyl(sex) sex n percent
Female 52185 0.246163788
Male 1734 0.008179515
nondet 158074 0.745656696
Code
# sexo Female Male nondet
# 1 99 1734 109411
# 2 52086 0 48653
# 9 0 0 10
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
HOSP_filter_df|>
filter(run %in% setdiff(invalid_sex_ext_info_post_ask$hash_key, hashs_invalid_sex_woman_ask_pregnant))|>
mutate(sex= infer_sex_icd10(diag1))|>
(\(df) {
print(message(paste0("Hospital, suggested sex (w/o pregnancy status), Entries: ", nrow(df))))
print(message(paste0("Hospital, suggested sex (w/o pregnancy status), RUNs: ", distinct(df, run) |> nrow())))
df
})()|>
select(run, sex)|>
summarise(female_icd10= sum(sex=="Female", na.rm=T), male_icd10= sum(sex=="Male", na.rm=T), .by= run,.groups="drop_last")|>
filter(female_icd10>0|male_icd10>0)|>
(\(df) {
cat(paste0("Cases with a suggested sex requiring external confirmation, lacking pregnancy information, but having available hospitalization records with ICD-10 diagnoses: ", nrow(df),"\n"))
filter(df, female_icd10>0) |> pull(run) ->> hashs_invalid_sex_woman_ask_non_pregnant_but_icd10
filter(df, male_icd10>0) |> pull(run) ->> hashs_invalid_sex_man_ask_not_pregnant_but_icd10
})()NULL
NULL
Cases with a suggested sex requiring external confirmation, lacking pregnancy information, but having available hospitalization records with ICD-10 diagnoses: 12
Code
HOSP_filter_df|>
filter(run %in% setdiff(invalid_sex_ext_info_post_nondet$hash_key, hashs_invalid_sex_nondet_pregnant))|>
mutate(sex= infer_sex_icd10(diag1))|>
(\(df) {
print(message(paste0("Hospital, non-determined sex (w/o pregnancy status), Entries: ", nrow(df))))
print(message(paste0("Hospital, non-determined sex (w/o pregnancy status), RUNs: ", distinct(df, run) |> nrow())))
df
})()|>
select(run, sex)|>
summarise(female_icd10= sum(sex=="Female", na.rm=T), male_icd10= sum(sex=="Male", na.rm=T), .by= run,.groups="drop_last")|>
filter(female_icd10>0|male_icd10>0)|>
(\(df) {
cat(paste0("Cases with a non-determined sex, lacking pregnancy information, but having available hospitalization records with ICD-10 diagnoses: ", nrow(df),"\n"))
filter(df, female_icd10>0) |> pull(run) ->> hashs_invalid_sex_woman_nondet_non_pregnant_but_icd10
filter(df, male_icd10>0) |> pull(run) ->> hashs_invalid_sex_man_nondet_not_pregnant_but_icd10
})()NULL
NULL
Cases with a non-determined sex, lacking pregnancy information, but having available hospitalization records with ICD-10 diagnoses: 5
Code
HOSP_filter_df|>
filter(run %in% hashs_invalid_sex_man_ask_pregnant)|>
mutate(sex= infer_sex_icd10(diag1))|>
(\(df) {
print(message(paste0("Hospital, suggested sex male (w/ pregnancy status), Entries: ", nrow(df))))
print(message(paste0("Hospital, suggested sex male (w/ pregnancy status), RUNs: ", distinct(df, run) |> nrow())))
df
})()|>
#distinct(run, sex)|>
select(run, sex)|>
summarise(female_icd10= sum(sex=="Female", na.rm=T), male_icd10= sum(sex=="Male", na.rm=T), .by= run,.groups="drop_last")|>
filter(female_icd10>0|male_icd10>0)|>
(\(df) {
cat(paste0("Cases with a suggested Male value, with pregnancy information, but having available hospitalization records with ICD-10 diagnoses: ", nrow(df),"\n"))
filter(df, female_icd10>0) |> pull(run) ->> hashs_invalid_sex_man_woman_nondet_non_pregnant_but_icd10
filter(df, male_icd10>0) |> pull(run) ->> hashs_invalid_sex_man_man_nondet_not_pregnant_but_icd10
})()NULL
NULL
Cases with a suggested Male value, with pregnancy information, but having available hospitalization records with ICD-10 diagnoses: 0
Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("Debería añadir el sexo aquí")
#invalid_sex_c1_2324_idgen
invalid_sex_ext_info_post_hosp<-
invalid_sex_ext_info_post|>
mutate(decision_post= case_when(hash_key %in%
hashs_invalid_sex_woman_ask_non_pregnant_but_icd10~ "6.c.1.4.a.ask_confirmed_icd10_female",
hash_key %in% hashs_invalid_sex_man_ask_not_pregnant_but_icd10~ "6.c.1.4.b.ask_confirmed_icd10_male",
hash_key %in% hashs_invalid_sex_woman_nondet_non_pregnant_but_icd10~ "6.c.1.5.a.nondet_confirmed_icd10_female",
hash_key %in% hashs_invalid_sex_man_nondet_not_pregnant_but_icd10~ "6.c.1.5.a.nondet_confirmed_icd10_male",
hash_key %in% hashs_invalid_sex_man_woman_nondet_non_pregnant_but_icd10~ "6.c.1.6.a.nondet_inconsistent_icd10_female",
hash_key %in% hashs_invalid_sex_man_man_nondet_not_pregnant_but_icd10~ "6.c.1.5.6.nondet_inconsistent_icd10_male",
hash_key %in% hashs_invalid_sex_woman_ask_pregnant~ "6.c.1.a.ask_confirmed_female",
hash_key %in% hashs_invalid_sex_man_ask_pregnant~ "6.c.1.b.ask_confirmed_male",
hash_key %in% hashs_invalid_sex_nondet_pregnant~ "6.c.2.nondet_female",
hash_key %in% hashs_invalid_sex_nondet_pregnant~ "6.c.3.ask_confirmed_female",T~ decision)) |> select(hash_key, decision_post)#decision
table(invalid_sex_ext_info_post_hosp$decision_post)|>
data.frame()|>
arrange(desc(Freq))|>
knitr::kable("markdown", caption="Decision after using hospitalization data")| Var1 | Freq |
|---|---|
| 6.a.a.1.female | 139 |
| 6.a.b.1.female | 125 |
| 6.a.a.2.male | 43 |
| 6.a.b.2.male, but ask aux data | 32 |
| 6.a.b.2.male | 27 |
| 6.a.c.nondet | 27 |
| 6.c.1.a.ask_confirmed_female | 20 |
| 6.c.2.nondet_female | 16 |
| 6.a.b.3.nondet | 14 |
| 6.b.a.3.nondet | 11 |
| 6.c.1.4.a.ask_confirmed_icd10_female | 11 |
| 6.b.b.1.male | 10 |
| 6.a.b.1.female, but ask aux data | 9 |
| 6.c.1.b.ask_confirmed_male | 6 |
| 6.c.1.5.a.nondet_confirmed_icd10_female | 5 |
| 6.a.b.5.male, but ask aux data | 2 |
| 6.b.a.2.male | 1 |
| 6.c.1.4.b.ask_confirmed_icd10_male | 1 |
| no sé | 1 |
Undetermined sex classifications were retained, recognizing that for these records, inferring sex from external data might be unreliable. This category could include cases where inference is difficult, as well as potential instances of gender transition.
Code
c1_inconsistent_sex_genid_10_21<-
SISTRAT23_c1_2010_2022_df_prev1o|>
filter(hash_key %in% invalid_sex_ext_info_post_hosp$hash_key) |>
select(hash_key, identidad_de_genero) |>
group_by(hash_key)|>
summarise(prop_fem_genid_1021=sum(grepl("fem",identidad_de_genero),na.rm=T)/sum(!is.na(identidad_de_genero), na.rm=T))|>
ungroup()
invalid_sex_ext_info_post_hosp_c1_10_24<-
invalid_sex_ext_info_post_hosp|>
left_join(c1_inconsistent_sex_genid_10_21,by="hash_key")|>
left_join(invalid_sex_c1_2324_idgen,by="hash_key")
SISTRAT23_c1_2010_2022_df_prev1p<-
SISTRAT23_c1_2010_2022_df_prev1o|>
(\(df) {
cat(paste0("6.Number of cases before resolving inconsistencies in sex: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("6.Number of patients before resolving inconsistencies in sex: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})()|>
left_join(invalid_sex_ext_info_post_hosp_c1_10_24, multiple="first")|>
mutate(sex_rec= case_when(grepl("det$",decision_post)~ sexo, grepl("female", decision_post)~ "mujer", grepl("male", decision_post)~ "hombre", T~ sexo))|>
#filter(is.na(sex_rec), !is.na(c1_2324_genid_1))|>
#select(hash_key, sexo, decision_post, prop_fem_genid_1021, c1_2324_genid_1, c1_2324_genid_2)|>
mutate(sex_rec= case_when(is.na(sex_rec) & (prop_fem_genid_1021>.5|grepl("fem", c1_2324_genid_1))~ "mujer", T~ sex_rec))|>
mutate(sex_rec= case_when(sex_rec=="mujer"~ "female", sex_rec=="hombre"~ "male", T~ NA_character_))|>
#janitor::tabyl(sex_rec, sexo)
mutate(OBS= case_when(!is.na(decision_post)~ paste0(as.character(OBS),";",decision_post), T~ OBS))|>
select(-decision_post)|>
(\(df) {
cat(paste0("6. After after resolving inconsistencies in sex, obs.: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("6. After after resolving inconsistencies in sex, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1o))stop("Error: Added treatment episodes in the process")
df
})() |>
select(-prop_fem_genid_1021, -c1_2324_genid_1, -c1_2324_genid_2)6.Number of cases before resolving inconsistencies in sex: 150,046
6.Number of patients before resolving inconsistencies in sex: 106,283
6. After after resolving inconsistencies in sex, obs.: 150,046
6. After after resolving inconsistencies in sex, RUNs: 106,283
The database resulting from these changes was named SISTRAT23_c1_2010_2022_df_prev1p, and the new variable containing the recoded sex information is called sex_rec.
2.2. Nationality
- Nationality (nacionalidad) (n= 119). We created a column called
nationallity_consshowing what those inconsistent nationalities are for each affected patient.
Code
invisible("no tiene perdidos, la mayoría son de chile. Por tanto, si es distinto a Chile, reemplazarlo")
invalid_nationality_by_patient<-
SISTRAT23_c1_2010_2022_df_prev1p|> group_by(hash_key)|> summarise(nacionalidades_por_hash = n_distinct(nacionalidad), distinto_chile = sum(nacionalidad!="chile", na.rm=T), tot_obs = n())|> ungroup()|> mutate(perc_extranjero = distinto_chile/tot_obs)|> filter(nacionalidades_por_hash>1)|> pull(hash_key)
multiple_nationalities<-
SISTRAT23_c1_2010_2022_df_prev1p|> select(hash_key, nacionalidad)|> filter(hash_key %in% invalid_nationality_by_patient)|>
summarise(nacionalidad_distinct = paste(sort(unique(nacionalidad)), collapse = "; "), .by="hash_key")|>
#mutate(rnnac= row_number(),.by="hash_key")|> pivot_wider(names_from="rnnac", values_from="nacionalidad")
ungroup()Code
invisible("This database is useless. We cant obtain information because there is no 1:1 linkage")
hosp_un_inv_2<-
rio::import(paste0(gsub("/cons","/data/20231205_original_data",getwd()),"/EH_2010_2022_Pasantes_v2_encrip.csv"))
#hosp_un_inv_2[,c("RUN", "ESTAB_HOMO", "FECHA_INGRESO_FMT_DEIS", "FECHA_EGRESO_FMT_DEIS", "SEXO", "EDAD_ANOS", "DIAG1", "DIAG2", "COND_EGR")]
hosp_un_inv_2_df<- hosp_un_inv_2 %>%
mutate(DIAG2= ifelse(nchar(DIAG2)<2, NA_character_, DIAG2)) %>%
mutate(
KEY = paste(ESTAB_HOMO, FECHA_INGRESO_FMT_DEIS, FECHA_EGRESO_FMT_DEIS,
SEXO, EDAD_ANOS, DIAG1, DIAG2, COND_EGR, sep = "|")
)
#HOSP_filter_df[, c("run", "estab_homo", "fecha_ingreso", "fecha_egreso", "sexo", "edad_anos", "diag1", "diag2", "cond_egr")]
HOSP_filter_df<- HOSP_filter_df %>%
mutate(
KEY = paste(estab_homo, fecha_ingreso, fecha_egreso,
sexo, edad_anos, diag1, diag2, cond_egr, sep = "|")
)
HOSP_filter_df_join_KEY_more_one<-
HOSP_filter_df|>
inner_join(hosp_un_inv_2_df, by="KEY")|>
group_by(KEY)|>
count()|>
ungroup()|>
filter(n>1)
HOSP_filter_df_join_KEY_only_one<-
HOSP_filter_df|>
inner_join(hosp_un_inv_2_df, by="KEY")|>
group_by(KEY)|>
mutate(n=n())|>
ungroup()|>
filter(n==1)
# HOSP_filter_df|>
# inner_join(hosp_un_inv_2_df, by="KEY")|>
# filter(GLOSA_PAIS_ORIGEN!="")|>
# #select(run, GLOSA_PAIS_ORIGEN)|>
# distinct(run, GLOSA_PAIS_ORIGEN)|>
# ungroup()|>
# group_by(run)|>
# mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
# pivot_wider(names_from = id, values_from = GLOSA_PAIS_ORIGEN,
# names_prefix = "hosp_nat_")2.3. Starting substance
Starting Substance (first_sub_used) (n= 13,881). For users that had only two treatments but a different starting substance, or in cases or users that had ties within most recent database or within the most recent value, we added a second and a third variable called sus_ini_2 and sus_ini_3 that contains a second starting substance. We also made sus_ini_mvv for starting substances of the most vulnerable value reported (Paste Base > Cocaine hydrochloride > Alcohol > Marijuana > Other).
Code
invalid_start_subs_hash_key<-
SISTRAT23_c1_2010_2022_df_prev1p|> summarise(sus_ini_por_hash = n_distinct(first_sub_used),.by=hash_key, .groups="drop_last")|> filter(sus_ini_por_hash>1)|> pull(hash_key)
cat("Number of distinct starting substances\n")
SISTRAT23_c1_2010_2022_df_prev1p|> summarise(sus_ini_por_hash = n_distinct(first_sub_used),.by=hash_key, .groups="drop_last")|> filter(sus_ini_por_hash>1)|> pull(sus_ini_por_hash)|> summary()
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("======================================================")
invalid_start_subs_c2<-
CONS_C2 |>
filter(HASH_KEY %in% invalid_start_subs_hash_key)|>
(\(df) {
print(message(paste0("C2, Entries: ", nrow(df))))
print(message(paste0("C2, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
df
})()|>
distinct(HASH_KEY, sustancia_inicial)|>
rename("start_sub"=2)|>
tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
ungroup()|>
group_by(HASH_KEY)|>
mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
pivot_wider(names_from = id, values_from = start_sub,
names_prefix = "c2_susini_")Code
invisible("======================================================")
invalid_start_subs_c3<-
CONS_C3 |>
filter(HASH_KEY %in% invalid_start_subs_hash_key)|>
(\(df) {
print(message(paste0("C2, Entries: ", nrow(df))))
print(message(paste0("C2, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
df
})()|>
distinct(HASH_KEY, sustanciade_inicio)|>
rename("start_sub"=2)|>
tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
ungroup()|>
group_by(HASH_KEY)|>
mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
pivot_wider(names_from = id, values_from = start_sub,
names_prefix = "c3_susini_")Code
invisible("======================================================")
invalid_start_subs_c4<-
CONS_C4 |>
filter(HASH_KEY %in% invalid_start_subs_hash_key)|>
(\(df) {
print(message(paste0("C2, Entries: ", nrow(df))))
print(message(paste0("C2, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
df
})()|>
distinct(HASH_KEY, sustanciadeinicio)|>
rename("start_sub"=2)|>
tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
ungroup()|>
group_by(HASH_KEY)|>
mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
pivot_wider(names_from = id, values_from = start_sub,
names_prefix = "c4_susini_")Code
invisible("======================================================")
invalid_start_subs_c5<-
CONS_C5 |>
filter(HASH_KEY %in% invalid_start_subs_hash_key)|>
(\(df) {
print(message(paste0("C2, Entries: ", nrow(df))))
print(message(paste0("C2, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
df
})()|>
distinct(HASH_KEY, sustancia_inicial)|>
rename("start_sub"=2)|>
tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
ungroup()|>
group_by(HASH_KEY)|>
mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
pivot_wider(names_from = id, values_from = start_sub,
names_prefix = "c5_susini_")Code
invisible("======================================================")
invalid_start_subs_c6<-
CONS_C6|>
filter(HASH_KEY %in% invalid_start_subs_hash_key)|>
(\(df) {
print(message(paste0("C6, Entries: ", nrow(df))))
print(message(paste0("C6, RUNs: ", distinct(df, HASH_KEY)|> nrow())))
df
})()|>
distinct(HASH_KEY, sustanciadeinicio)|>
rename("start_sub"=2)|>
tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
ungroup()|>
group_by(HASH_KEY)|>
mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
pivot_wider(names_from = id, values_from = start_sub,
names_prefix = "c6_susini_")Code
invisible("======================================================")
invalid_start_subs_c2_2324<-
c2_2324 |>
filter(hashkey %in% invalid_start_subs_hash_key)|>
(\(df) {
print(message(paste0("C2,23-24, Entries: ", nrow(df))))
print(message(paste0("C2,23-24, RUNs: ", distinct(df, hashkey)|> nrow())))
df
})()|>
mutate(sustancia_inicial= tolower(sustancia_inicial))|>
distinct(hashkey, sustancia_inicial)|>
rename("start_sub"=2)|>
tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
ungroup()|>
group_by(hashkey)|>
mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
pivot_wider(names_from = id, values_from = start_sub,
names_prefix = "c2_susini2324_")Code
invisible("======================================================")
invalid_start_subs_c1_2324<-
SISTRAT23_c1_2023_2024_df2|>
filter(hash_key %in% invalid_start_subs_hash_key)|>
(\(df) {
print(message(paste0("C1,23-24, Entries: ", nrow(df))))
print(message(paste0("C1,23-24, RUNs: ", distinct(df, hash_key)|> nrow())))
df
})()|>
distinct(hash_key, sustancia_de_inicio)|>
rename("start_sub"=2)|>
tidytable::mutate(start_sub= tidytable::case_when(grepl("coca",start_sub)~ "cocaine powder", grepl("crack|pasta",start_sub)~ "cocaine paste", grepl("marihuana",start_sub)~ "marijuana", grepl("anfeta|extasis|fenil|estimul",start_sub)~ "amphetamine-type stimulants", grepl("alucin|lsd|hongos",start_sub)~ "hallucinogens", grepl("opi|hero|metadona",start_sub)~ "opioids", grepl("sedante|hipnotico|tranquiliz",start_sub)~ "tranquilizers/hypnotics", grepl("inhalable",start_sub)~ "inhalants", grepl("esteroid|otros",start_sub)~"others", grepl("especif|cip-crc|sin consumo",start_sub)~ NA_character_, TRUE~start_sub))|>
ungroup()|>
group_by(hash_key)|>
mutate(id = as.character(dplyr::row_number()))|> # Convertir `id` a carácter
pivot_wider(names_from = id, values_from = start_sub,
names_prefix = "c1_2224_susini2324_")Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("Generar jerarquías de sustancias de inicio")
#attr(table(SISTRAT23_c1_2010_2022_df_prev1p$first_sub_used),"names") |> dput()
substances<- c("alcohol", "amphetamine-type stimulants", "cocaine paste",
"cocaine powder", "hallucinogens", "inhalants", "marijuana",
"opioids", "others", "tranquilizers/hypnotics")
cat("Make counts by RUN and top substances in case of more than one initial substance\n")
invalid_start_subs_ext_info<-
SISTRAT23_c1_2010_2022_df_prev1p|> #00068c7eed2a6c21c8750f250b601cbfe29262728726655a0958c49ce64667d0 $ first_sub_used <chr> NA, "others", "alcohol"
tidytable::filter(hash_key %in% invalid_start_subs_hash_key)|>
select(hash_key, first_sub_used)|>
group_by(hash_key)|>
summarise(
alcohol = sum(first_sub_used == "alcohol", na.rm=T),
amphetamine_type_stimulants = sum(first_sub_used == "amphetamine-type stimulants", na.rm=T),
cocaine_paste = sum(first_sub_used == "cocaine paste", na.rm=T),
cocaine_powder = sum(first_sub_used == "cocaine powder", na.rm=T),
hallucinogens = sum(first_sub_used == "hallucinogens", na.rm=T),
inhalants = sum(first_sub_used == "inhalants", na.rm=T),
marijuana = sum(first_sub_used == "marijuana", na.rm=T),
opioids = sum(first_sub_used == "opioids", na.rm=T),
others = sum(first_sub_used == "others", na.rm=T),
tranquilizers_hypnotics = sum(first_sub_used == "tranquilizers/hypnotics", na.rm=T),
total = n(), # total records per hash_key
.groups = "drop"
)|>
tidylog::left_join(invalid_start_subs_c2, by=c("hash_key"="HASH_KEY"), multiple="first")|>
tidylog::left_join(invalid_start_subs_c3, by=c("hash_key"="HASH_KEY"), multiple="first")|>
tidylog::left_join(invalid_start_subs_c4, by=c("hash_key"="HASH_KEY"), multiple="first")|>
tidylog::left_join(invalid_start_subs_c5, by=c("hash_key"="HASH_KEY"), multiple="first")|>
tidylog::left_join(invalid_start_subs_c6, by=c("hash_key"="HASH_KEY"), multiple="first")|>
tidylog::left_join(invalid_start_subs_c1_2324, by=c("hash_key"="hash_key"), multiple="first")|>
tidylog::left_join(invalid_start_subs_c2_2324, by=c("hash_key"="hashkey"), multiple="first")|>
(\(df) {
# 1) coerce to a plain data.frame so base R subsetting works
df2 <- as.data.frame(df)
# 2) find and lowercase your susini columns
susini_cols <- grep("^c[2-6]_susini_\\d+$", names(df2), value = TRUE)
df2[susini_cols] <- lapply(df2[susini_cols], tolower)
# 3) for each substance, add a new “n_<substance>” column
for (sub in substances) {
safe_nm <- paste0("n_", make.names(sub))
df2[[safe_nm]] <- rowSums(df2[susini_cols] == sub, na.rm = TRUE)
}
# 4) now build the full list of count-columns to rank
base_counts <- c(
"alcohol",
"amphetamine_type_stimulants",
"cocaine_paste",
"cocaine_powder",
"hallucinogens",
"inhalants",
"marijuana",
"opioids",
"others",
"tranquilizers_hypnotics"
)
row_counts <- paste0("n_", make.names(substances))
safe_names <- c(base_counts, row_counts)
# 5) mapping back to the human labels
all_labels <- c(substances, substances)
names(all_labels) <- safe_names
# 6) compute top 3 over these combined counts
top3 <- t(apply(df2[safe_names], 1, function(x) {
# only positive counts
x_pos <- x[x > 0]
if (length(x_pos) == 0) return(rep(NA_character_, 3))
ord <- sort(x_pos, decreasing = TRUE)
nm3 <- names(ord)[seq_len(min(3, length(ord)))]
# map back & pad with NAs
c(all_labels[nm3], rep(NA_character_, 3 - length(nm3)))
}))
colnames(top3) <- paste0("sus_ini_", 1:3, "_mod")
df2 <- cbind(df2, as.data.frame(top3, stringsAsFactors = FALSE))
df2
})()Number of distinct starting substances
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.000 2.000 2.000 2.144 2.000 5.000
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
Make counts by RUN and top substances in case of more than one initial substance
Now we generated the criteria of the most vulnerable variable.
Code
invisible("buscar criterio de mvv")
SISTRAT23_c1_2010_2022_df_prev1q<-
SISTRAT23_c1_2010_2022_df_prev1p|>
(\(df) {
cat(paste0("7.Number of cases before adding inconsistencies in nationallity & starting substance: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("7.Number of patients before adding inconsistencies in nationallity & starting substance: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})()|>
left_join(multiple_nationalities, multiple="first")|>
mutate(nationallity_cons= case_when(!is.na(nacionalidad_distinct)~ nacionalidad_distinct, T~ nacionalidad))|>
select(-nacionalidad_distinct)|>
(\(df) {
cat(paste0("7.Number of cases before adding inconsistencies in starting substance: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("7.Number of patients before adding inconsistencies in starting substance: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
df
})()|>
left_join(invalid_start_subs_ext_info[, c("hash_key",paste0("sus_ini_",1:3,"_mod"))], multiple="first")|>
mutate(sus_ini_mod_pb=case_when(grepl("past",first_sub_used) & is.na(sus_ini_1_mod)~"cocaine paste",
grepl("past",sus_ini_3_mod)~"cocaine paste",
grepl("past",sus_ini_2_mod)~"cocaine paste",
grepl("past",sus_ini_1_mod)~"cocaine paste",
TRUE~NA_character_))|>
mutate(sus_ini_mod_oh=case_when(grepl("alcohol",first_sub_used) & is.na(sus_ini_1_mod)~"alcohol", grepl("alcohol",sus_ini_3_mod)~"alcohol",
grepl("alcohol",sus_ini_2_mod)~"alcohol",
grepl("alcohol",sus_ini_1_mod)~"alcohol",
TRUE~NA_character_))|>
mutate(sus_ini_mod_coc=case_when(grepl("powder",first_sub_used) & is.na(sus_ini_1_mod)~"cocaine powder",
grepl("powder",sus_ini_3_mod)~"cocaine powder",
grepl("powder",sus_ini_2_mod)~"cocaine powder",
grepl("powder",sus_ini_1_mod)~"cocaine powder",
TRUE~NA_character_))|>
mutate(sus_ini_mod_mar=case_when(grepl("marij",first_sub_used) & is.na(sus_ini_1_mod)~"marijuana",
grepl("marij",sus_ini_3_mod)~"marijuana",
grepl("marij",sus_ini_2_mod)~"marijuana",
grepl("marij",sus_ini_1_mod)~"marijuana",
TRUE~NA_character_))|>
mutate(sus_ini_mod_otr=case_when(!grepl("alcohol|past|powder|marij",first_sub_used) & !is.na(first_sub_used) & is.na(sus_ini_1_mod)~ "others",
!grepl("alcohol|past|powder|marij",sus_ini_3_mod) & !is.na(sus_ini_3_mod)~"others",
!grepl("alcohol|past|powder|marij",sus_ini_2_mod) & !is.na(sus_ini_2_mod)~"others",
!grepl("alcohol|past|powder|marij",sus_ini_1_mod) & !is.na(sus_ini_1_mod)~"others",
TRUE~NA_character_))|>
mutate(sus_ini_mod_mvv=case_when(grepl("past", sus_ini_mod_pb)~"cocaine paste",
grepl("powder",sus_ini_mod_coc)~"cocaine powder",
grepl("marijuana",sus_ini_mod_mar)~"marijuana",
grepl("alcohol",sus_ini_mod_oh)~"alcohol",
grepl("oth",sus_ini_mod_otr)~"others",
TRUE~NA_character_))|>
#janitor::tabyl(sus_ini_mod_mvv)
mutate(sus_ini_mod_mvv=factor(sus_ini_mod_mvv,labels=c("alcohol",
"cocaine powder",
"marijuana",
"others",
"cocaine paste")))|>
select(-any_of(c("sus_ini_mod_pb", "sus_ini_mod_oh", "sus_ini_mod_coc", "sus_ini_mod_mar", "sus_ini_mod_otr")))|>
(\(df) {
cat(paste0("7. After after resolving inconsistencies in nationallity & starting substance, obs.: ", formatC(nrow(df), big.mark=",")),"\n")
cat(paste0("7. After after resolving inconsistencies in nationallity & starting substance, RUNs: ", formatC(nrow(distinct(df, hash_key)), big.mark=",")),"\n")
if (nrow(df) > nrow(SISTRAT23_c1_2010_2022_df_prev1p))stop("Error: Added treatment episodes in the process")
df
})() |>
select(-sus_ini_1_mod, -sus_ini_2_mod, -sus_ini_3_mod)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#starting substance
#Segunda Sustancia de Inicio(Sólo más frecuentes) Second Starting Substance
#Tercera Sustancia de Inicio(Sólo más frecuentes) Third Starting Substance
#Sustancia de Inicio (Sólo más frecuentes) Starting Substance (Only more frequent)7.Number of cases before adding inconsistencies in nationallity & starting substance: 150,046
7.Number of patients before adding inconsistencies in nationallity & starting substance: 106,283
7.Number of cases before adding inconsistencies in starting substance: 150,046
7.Number of patients before adding inconsistencies in starting substance: 106,283
7. After after resolving inconsistencies in nationallity & starting substance, obs.: 150,046
7. After after resolving inconsistencies in nationallity & starting substance, RUNs: 106,283
Eliminate to comply with ethic concerns
Code
#Primero, se eliminará completamente la columna "Código Identificación
#de SENDA", ya que contiene información potencialmente vulnerable basada en atributos
#personales. En segundo lugar, dado que la identificación del centro de tratamiento podría
#facilitar la reidentificación indirecta de algunos pacientes, esta variable será sometida a
#un proceso de encriptación irreversible utilizando el paquete estadístico “sodium” (v.1.4.0; Ooms, J., 2024).To close the project, we erase polars objects.
Code
rm(list = ls()[grepl("_pl$", ls())])Session info
Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
message(paste0("R library: ", Sys.getenv("R_LIBS_USER")))Code
message(paste0("Date: ",withr::with_locale(new = c('LC_TIME' = 'C'), code =Sys.time())))Code
message(paste0("Editor context: ", path))Code
cat("quarto version: "); quarto::quarto_version()quarto version:
[1] '1.7.29'
Code
sesion_info <- devtools::session_info()Warning in system2(“quarto”, “-V”, stdout = TRUE, env = paste0(“TMPDIR=”, : el comando ejecutado ‘“quarto” TMPDIR=C:/Users/andre/AppData/Local/Temp/Rtmpa25KUs/file13d385ce5280e -V’ tiene el estatus 1
Code
dplyr::select(
tibble::as_tibble(sesion_info$packages),
c(package, loadedversion, source)
) %>%
DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
caption = htmltools::tags$caption(
style = 'caption-side: top; text-align: left;',
'', htmltools::em('R packages')),
options=list(
initComplete = htmlwidgets::JS(
"function(settings, json) {",
"$(this.api().tables().body()).css({
'font-family': 'Helvetica Neue',
'font-size': '70%',
'code-inline-font-size': '15%',
'white-space': 'nowrap',
'line-height': '0.75em',
'min-height': '0.5em'
});",
"}")))Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
#| class-output: center-table
reticulate::py_list_packages() %>%
DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
caption = htmltools::tags$caption(
style = 'caption-side: top; text-align: left;',
'', htmltools::em('Python packages')),
options=list(
initComplete = htmlwidgets::JS(
"function(settings, json) {",
"$(this.api().tables().body()).css({
'font-family': 'Helvetica Neue',
'font-size': '70%',
'code-inline-font-size': '15%',
'white-space': 'nowrap',
'line-height': '0.75em',
'min-height': '0.5em'
});",
"}"))) Error in path.expand(path): argumento ‘path’ inválido
Save
Code
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
paste0(getwd(),"/cons")
file.path(paste0(wdpath,"data/20241015_out"))
file.path(paste0(envpath,"data/20241015_out"))
# Save
rdata_path <- file.path(wdpath, "data/20241015_out", paste0("4_ndp_", format(Sys.time(), "%Y_%m_%d"), ".Rdata"))
save.image(rdata_path)
cat("Saved in:",
rdata_path)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
if (Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv")) {
password <- Sys.getenv("PASSWORD_SECRET")
} else {
if (interactive()) {
utils::savehistory(tempfile())
Sys.setenv(PASSWORD_SECRET = readLines(paste0(wdpath, "secret.txt"), warn = FALSE))
utils::loadhistory()
}
Sys.setenv(PASSWORD_SECRET = readLines(paste0(wdpath, "secret.txt"), warn = FALSE))
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
save.image(paste0(rdata_path,".enc"))
# Encriptar el archivo en el mismo lugar
httr2::secret_encrypt_file(path = paste0(rdata_path,".enc"), key = "PASSWORD_SECRET")
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Copy renv lock into cons folder\n")
if (Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv")) {
message("Running on RStudio Server or inside Docker. Folder copy skipped.")
} else {
source_folder <-
destination_folder <- paste0(wdpath,"cons/renv")
# Copy the folder recursively
file.copy(paste0(wdpath,"renv.lock"), paste0(wdpath,"cons/renv.lock"), overwrite = TRUE)
message("Renv lock copy performed.")
}Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
time_after_dedup2<-Sys.time()
paste0("Time in markdown: ");time_after_dedup2-time_before_dedup2[1] "G:/My Drive/Alvacast/SISTRAT 2023/cons/cons"
[1] "G:/My Drive/Alvacast/SISTRAT 2023//data/20241015_out"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/data/20241015_out"
Saved in: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/4_ndp_2025_06_06.RdataCopy renv lock into cons folder
[1] "Time in markdown: "
Time difference of 9.14165 mins